Diffstat (limited to 'contrib/lua-torch/nn/lib/THNN/generic')
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/Abs.c  28
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/AbsCriterion.c  40
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/BCECriterion.c  66
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/BatchNormalization.c  149
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/ClassNLLCriterion.c  163
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/DistKLDivCriterion.c  44
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/ELU.c  54
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/FusedRNNKernel.c  55
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/GatedLinearUnit.c  73
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/HardShrink.c  42
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/HardTanh.c  133
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/IndexLinear.c  742
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/L1Cost.c  38
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/LeakyReLU.c  57
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/Linear.c  114
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/LogSigmoid.c  36
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/LogSoftMax.c  137
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/LookupTable.c  225
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/MSECriterion.c  45
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/MarginCriterion.c  47
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/MultiLabelMarginCriterion.c  184
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/MultiMarginCriterion.c  168
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/PReLU.c  207
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/RReLU.c  132
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/Sigmoid.c  28
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SmoothL1Criterion.c  49
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SoftMarginCriterion.c  44
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SoftMax.c  150
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SoftPlus.c  47
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SoftShrink.c  42
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SparseLinear.c  564
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveAveragePooling.c  258
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveMaxPooling.c  274
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialAveragePooling.c  329
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialClassNLLCriterion.c  131
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionLocal.c  367
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMM.c  377
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMap.c  277
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialDepthWiseConvolution.c  528
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedConvolution.c  408
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedMaxPooling.c  401
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialFractionalMaxPooling.c  253
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolution.c  462
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolutionMap.c  222
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxPooling.c  44
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxUnpooling.c  234
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialReflectionPadding.c  260
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialReplicationPadding.c  260
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialSubSampling.c  302
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingBilinear.c  174
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingNearest.c  199
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/Sqrt.c  52
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/Square.c  59
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/THNN.h  1501
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/Tanh.c  49
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/TemporalConvolution.c  398
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/TemporalMaxPooling.c  283
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/TemporalRowConvolution.c  472
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/TemporalSubSampling.c  156
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/Threshold.c  64
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/VolumetricAveragePooling.c  373
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolution.c  260
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolutionMM.c  628
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedConvolution.c  420
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedMaxPooling.c  515
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/VolumetricFractionalMaxPooling.c  279
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/VolumetricFullConvolution.c  541
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxPooling.c  50
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxUnpooling.c  373
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/VolumetricReplicationPadding.c  357
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingNearest.c  226
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingTrilinear.c  213
-rw-r--r--  contrib/lua-torch/nn/lib/THNN/generic/unfold.c  166
73 files changed, 17098 insertions(+), 0 deletions(-)
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Abs.c b/contrib/lua-torch/nn/lib/THNN/generic/Abs.c
new file mode 100644
index 000000000..28721ec8e
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/Abs.c
@@ -0,0 +1,28 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Abs.c"
+#else
+
+void THNN_(Abs_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output)
+{
+ THTensor_(resizeAs)(output, input);
+ THTensor_(abs)(output, input);
+}
+
+void THNN_(Abs_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput)
+{
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ real z = *input_data;
+ *gradInput_data = *gradOutput_data * (z >= 0 ? 1 : -1);
+ );
+}
+
+#endif
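
For reference, the backward pass above reduces to the subgradient of |x|, with sign(0) taken as +1 (the `z >= 0 ? 1 : -1` convention). A minimal standalone sketch in plain C, with illustrative values and no TH dependencies:

```c
#include <stdio.h>
#include <math.h>

int main(void) {
  double x[4] = {-2.0, -0.5, 0.0, 3.0};
  double gradOut[4] = {1.0, 1.0, 1.0, 1.0};
  for (int i = 0; i < 4; i++) {
    double y = fabs(x[i]);                                   /* forward */
    double gradIn = gradOut[i] * (x[i] >= 0 ? 1.0 : -1.0);   /* subgradient */
    printf("x=% .1f  y=%.1f  gradIn=% .1f\n", x[i], y, gradIn);
  }
  return 0;
}
```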
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/AbsCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/AbsCriterion.c
new file mode 100644
index 000000000..9bee5de9e
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/AbsCriterion.c
@@ -0,0 +1,40 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/AbsCriterion.c"
+#else
+
+void THNN_(AbsCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage)
+{
+ real sum = 0;
+ THNN_CHECK_NELEMENT(input, target);
+ TH_TENSOR_APPLY2(real, input, real, target,
+ sum += fabs(*input_data - *target_data);
+ );
+
+ if (sizeAverage)
+ sum /= THTensor_(nElement)(input);
+
+ THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(AbsCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
+
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+ *gradInput_data = (*input_data - *target_data) >= 0 ? norm : -norm;
+ );
+}
+
+#endif
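
The criterion is the L1 distance, optionally averaged over the element count; the gradient is +norm or -norm depending on the sign of (input - target). A self-contained sketch of the same arithmetic, with made-up values:

```c
#include <stdio.h>
#include <math.h>

int main(void) {
  double input[3] = {0.5, -1.0, 2.0};
  double target[3] = {1.0, -1.0, 0.0};
  int n = 3, sizeAverage = 1;
  double sum = 0, norm = sizeAverage ? 1.0 / n : 1.0;
  for (int i = 0; i < n; i++)
    sum += fabs(input[i] - target[i]);
  if (sizeAverage) sum /= n;
  printf("loss = %.4f\n", sum);                /* (0.5 + 0 + 2) / 3 */
  for (int i = 0; i < n; i++)                  /* gradient is +-norm */
    printf("grad[%d] = % .4f\n", i, (input[i] - target[i]) >= 0 ? norm : -norm);
  return 0;
}
```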
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/BCECriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/BCECriterion.c
new file mode 100644
index 000000000..637a4067e
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/BCECriterion.c
@@ -0,0 +1,66 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/BCECriterion.c"
+#else
+
+#define EPS 1e-12
+
+void THNN_(BCECriterion_updateOutput)(THNNState *state, THTensor *input,
+ THTensor *target, THTensor *output,
+ bool sizeAverage, THTensor *weights)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_NELEMENT(input, weights);
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+ real sum = 0;
+
+ if(weights)
+ TH_TENSOR_APPLY3(real, input, real, target, real, weights,
+ real x = *input_data;
+ real y = *target_data;
+ real w = *weights_data;
+ THAssertMsg(x >= 0. && x <= 1.,
+                  "input value should be between 0 and 1, but got %f",
+ (double) x);
+ sum -= (log(x + EPS) * y + log(1. - x + EPS) * (1. - y)) * w;
+ )
+ else
+ TH_TENSOR_APPLY2(real, input, real, target,
+ real x = *input_data;
+ real y = *target_data;
+ THAssertMsg(x >= 0. && x <= 1.,
+                  "input value should be between 0 and 1, but got %f",
+ (double) x);
+ sum -= log(x + EPS) * y + log(1. - x + EPS) * (1. - y);
+ );
+
+ if (sizeAverage)
+ sum /= THTensor_(nElement)(input);
+
+ THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(BCECriterion_updateGradInput)(THNNState *state, THTensor *input,
+ THTensor *target, THTensor *gradInput,
+ bool sizeAverage, THTensor *weights)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_NELEMENT(input, weights);
+
+ real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
+
+ THTensor_(resizeAs)(gradInput, input);
+
+ TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+ real x = *input_data;
+ real y = *target_data;
+ *gradInput_data = - norm * (y - x) / ((1. - x + EPS) * (x + EPS));
+ );
+
+ if(weights)
+ THTensor_(cmul)(gradInput, gradInput, weights);
+}
+
+#undef EPS
+
+#endif
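
Per element the loss is -(y*log(x + EPS) + (1 - y)*log(1 - x + EPS)) and the gradient is -(y - x) / ((1 - x + EPS)*(x + EPS)), exactly the two loops above. A one-element sketch with illustrative values:

```c
#include <stdio.h>
#include <math.h>
#define EPS 1e-12

int main(void) {
  double x = 0.9, y = 1.0;  /* prediction in [0,1], binary target */
  double loss = -(y * log(x + EPS) + (1. - y) * log(1. - x + EPS));
  double grad = -(y - x) / ((1. - x + EPS) * (x + EPS));
  printf("loss = %.6f  grad = %.6f\n", loss, grad);  /* ~0.105361, ~-1.111111 */
  return 0;
}
```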
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/BatchNormalization.c b/contrib/lua-torch/nn/lib/THNN/generic/BatchNormalization.c
new file mode 100644
index 000000000..b8f462790
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/BatchNormalization.c
@@ -0,0 +1,149 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/BatchNormalization.c"
+#else
+
+void THNN_(BatchNormalization_updateOutput)(
+ THNNState *state, THTensor *input, THTensor *output,
+ THTensor *weight, THTensor *bias,
+ THTensor *running_mean, THTensor *running_var,
+ THTensor *save_mean, THTensor *save_std,
+ bool train, double momentum, double eps)
+{
+ THTensor_(resizeAs)(output, input);
+ long nInput = THTensor_(size)(input, 1);
+ long f;
+ ptrdiff_t n = THTensor_(nElement)(input) / nInput;
+
+ #pragma omp parallel for
+ for (f = 0; f < nInput; ++f) {
+ THTensor *in = THTensor_(newSelect)(input, 1, f);
+ THTensor *out = THTensor_(newSelect)(output, 1, f);
+
+ real mean, invstd;
+
+ if (train) {
+ // compute mean per input
+ accreal sum = 0;
+ TH_TENSOR_APPLY(real, in, sum += *in_data;);
+
+ mean = (real) sum / n;
+ THTensor_(set1d)(save_mean, f, (real) mean);
+
+ // compute variance per input
+ sum = 0;
+ TH_TENSOR_APPLY(real, in,
+ sum += (*in_data - mean) * (*in_data - mean););
+
+ if (sum == 0 && eps == 0.0) {
+ invstd = 0;
+ } else {
+ invstd = (real) (1 / sqrt(sum/n + eps));
+ }
+ THTensor_(set1d)(save_std, f, (real) invstd);
+
+ // update running averages
+ THTensor_(set1d)(running_mean, f,
+ (real) (momentum * mean + (1 - momentum) * THTensor_(get1d)(running_mean, f)));
+
+ accreal unbiased_var = sum / (n - 1);
+ THTensor_(set1d)(running_var, f,
+ (real) (momentum * unbiased_var + (1 - momentum) * THTensor_(get1d)(running_var, f)));
+ } else {
+ mean = THTensor_(get1d)(running_mean, f);
+ invstd = 1 / sqrt(THTensor_(get1d)(running_var, f) + eps);
+ }
+
+ // compute output
+ real w = weight ? THTensor_(get1d)(weight, f) : 1;
+ real b = bias ? THTensor_(get1d)(bias, f) : 0;
+
+ TH_TENSOR_APPLY2(real, in, real, out,
+ *out_data = (real) (((*in_data - mean) * invstd) * w + b););
+
+ THTensor_(free)(out);
+ THTensor_(free)(in);
+ }
+}
+
+void THNN_(BatchNormalization_backward)(
+ THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput,
+ THTensor *gradWeight, THTensor *gradBias, THTensor *weight,
+ THTensor *running_mean, THTensor *running_var,
+ THTensor *save_mean, THTensor *save_std,
+ bool train, double scale, double eps)
+{
+ THNN_CHECK_SHAPE(input, gradOutput);
+ long nInput = THTensor_(size)(input, 1);
+ long f;
+ ptrdiff_t n = THTensor_(nElement)(input) / nInput;
+
+ #pragma omp parallel for
+ for (f = 0; f < nInput; ++f) {
+ THTensor *in = THTensor_(newSelect)(input, 1, f);
+ THTensor *gradOut = THTensor_(newSelect)(gradOutput, 1, f);
+ real w = weight ? THTensor_(get1d)(weight, f) : 1;
+ real mean, invstd;
+ if (train) {
+ mean = THTensor_(get1d)(save_mean, f);
+ invstd = THTensor_(get1d)(save_std, f);
+ } else {
+ mean = THTensor_(get1d)(running_mean, f);
+ invstd = 1 / sqrt(THTensor_(get1d)(running_var, f) + eps);
+ }
+
+ // sum over all gradOutput in feature plane
+ accreal sum = 0;
+ TH_TENSOR_APPLY(real, gradOut, sum += *gradOut_data;);
+
+    // dot product of Q(X) and gradOutput
+ accreal dotp = 0;
+ TH_TENSOR_APPLY2(real, in, real, gradOut,
+ dotp += (*in_data - mean) * (*gradOut_data););
+
+ if (gradInput) {
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor *gradIn = THTensor_(newSelect)(gradInput, 1, f);
+
+ if (train) {
+ // when in training mode
+ // Q(X) = X - E[x] ; i.e. input centered to zero mean
+ // Y = Q(X) / σ ; i.e. BN output before weight and bias
+ // dL/dX = (Q(dL/dY) - dot(Y, dL/dY) * Y) / σ * w
+
+ // projection of gradOutput on to output scaled by std
+ real k = (real) dotp * invstd * invstd / n;
+ TH_TENSOR_APPLY2(real, gradIn, real, in,
+ *gradIn_data = (*in_data - mean) * k;);
+
+ accreal gradMean = sum / n;
+ TH_TENSOR_APPLY2(real, gradIn, real, gradOut,
+ *gradIn_data = (*gradOut_data - gradMean - *gradIn_data) * invstd * w;);
+
+ } else {
+ // when in evaluation mode
+ // Q(X) = X - running_mean ; i.e. input centered to zero mean
+ // Y = Q(X) / running_std ; i.e. BN output before weight and bias
+        // dL/dX = dL/dY * w / running_std
+ TH_TENSOR_APPLY2(real, gradIn, real, gradOut,
+ *gradIn_data = *gradOut_data * invstd * w;);
+ }
+
+ THTensor_(free)(gradIn);
+ }
+
+ if (gradWeight) {
+ real val = THTensor_(get1d)(gradWeight, f);
+ THTensor_(set1d)(gradWeight, f, val + scale * dotp * invstd);
+ }
+
+ if (gradBias) {
+ real val = THTensor_(get1d)(gradBias, f);
+ THTensor_(set1d)(gradBias, f, val + scale * sum);
+ }
+
+ THTensor_(free)(gradOut);
+ THTensor_(free)(in);
+ }
+}
+
+#endif
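
In training mode the forward pass normalizes each feature plane with the biased variance (sum of squared deviations over n), while the running variance is updated with the unbiased estimate (over n - 1). A standalone sketch of the normalization for one plane, with made-up inputs:

```c
#include <stdio.h>
#include <math.h>

int main(void) {
  double in[4] = {1.0, 2.0, 3.0, 4.0};
  int n = 4;
  double eps = 1e-5, w = 1.0, b = 0.0;  /* weight/bias for this plane */
  double sum = 0, mean, var = 0, invstd;
  for (int i = 0; i < n; i++) sum += in[i];
  mean = sum / n;
  for (int i = 0; i < n; i++) var += (in[i] - mean) * (in[i] - mean);
  invstd = 1.0 / sqrt(var / n + eps);   /* biased variance, as in the loop above */
  for (int i = 0; i < n; i++)
    printf("out[%d] = % .4f\n", i, (in[i] - mean) * invstd * w + b);
  return 0;
}
```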
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/ClassNLLCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/ClassNLLCriterion.c
new file mode 100644
index 000000000..4cf37aeaf
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/ClassNLLCriterion.c
@@ -0,0 +1,163 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/ClassNLLCriterion.c"
+#else
+
+void THNN_(ClassNLLCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *output,
+ bool sizeAverage,
+ THTensor *weights,
+ THTensor *total_weight,
+ long ignore_index)
+{
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+ THNN_CHECK_DIM_SIZE(total_weight, 1, 0, 1);
+ int n_dims = THTensor_(nDimension)(input);
+ int n_classes = THTensor_(size)(input, n_dims - 1);
+ ignore_index -= TH_INDEX_BASE;
+
+ if (THIndexTensor_(nDimension)(target) > 1) {
+ THError("multi-target not supported");
+ }
+ if (THTensor_(nDimension)(input) > 2) {
+ THError("input tensor should be 1D or 2D");
+ }
+ if (weights && THTensor_(nElement)(weights) != n_classes) {
+ THDescBuff s1 = THTensor_(sizeDesc)(weights);
+ THError("weight tensor should be defined either for all %d classes or no classes"
+ " but got weight tensor of shape: %s", n_classes, s1.str);
+ }
+
+ input = THTensor_(newContiguous)(input);
+ target = THIndexTensor_(newContiguous)(target);
+ weights = weights ? THTensor_(newContiguous)(weights) : NULL;
+
+ real *input_data = THTensor_(data)(input);
+ THIndex_t *target_data = THIndexTensor_(data)(target);
+ real *weights_data = weights ? THTensor_(data)(weights) : NULL;
+ real *output_data = THTensor_(data)(output);
+ real *total_weight_data = THTensor_(data)(total_weight);
+
+ output_data[0] = total_weight_data[0] = 0.0;
+
+ if (THTensor_(nDimension)(input) == 1) {
+ int cur_target = target_data[0] - TH_INDEX_BASE;
+ if (cur_target != ignore_index) {
+ THAssert(cur_target >= 0 && cur_target < n_classes);
+ total_weight_data[0] = weights ? weights_data[cur_target] : 1.0f;
+ output_data[0] = -input_data[cur_target] * total_weight_data[0];
+ }
+ } else if (THTensor_(nDimension)(input) == 2) {
+ int batch_size = THTensor_(size)(input, 0);
+ THAssert(THIndexTensor_(size)(target, 0) == batch_size);
+
+ int n_target = THTensor_(size)(input, 1);
+
+ int i;
+ for (i = 0; i < batch_size; i++) {
+ int cur_target = target_data[i] - TH_INDEX_BASE;
+ if (cur_target != ignore_index) {
+ THAssert(cur_target >= 0 && cur_target < n_classes);
+
+ real cur_weight = weights ? weights_data[cur_target] : 1.0f;
+ total_weight_data[0] += cur_weight;
+ output_data[0] -= input_data[i * n_target + cur_target] * cur_weight;
+ }
+ }
+ }
+
+ if (sizeAverage && total_weight_data[0]) {
+ output_data[0] /= total_weight_data[0];
+ }
+
+ if (weights) {
+ THTensor_(free)(weights);
+ }
+ THTensor_(free)(input);
+ THIndexTensor_(free)(target);
+}
+
+void THNN_(ClassNLLCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage,
+ THTensor *weights,
+ THTensor *total_weight,
+ long ignore_index)
+{
+ int n_dims = THTensor_(nDimension)(input);
+ int n_classes = THTensor_(size)(input, n_dims - 1);
+ ignore_index -= TH_INDEX_BASE;
+
+ if (!THTensor_(isContiguous)(gradInput)) {
+ THError("gradInput must be contiguous");
+ }
+
+ real *total_weight_data = THTensor_(data)(total_weight);
+
+ if (!(*total_weight_data > 0)) {
+ return;
+ }
+
+ if (THIndexTensor_(nDimension)(target) > 1) {
+ THError("multi-target not supported");
+ }
+
+ if (THTensor_(nDimension)(input) > 2) {
+ THError("input tensor should be 1D or 2D");
+ }
+
+ if (weights && THTensor_(nElement)(weights) != n_classes) {
+ THError("weight tensor should be defined either for all or no classes");
+ }
+
+ target = THIndexTensor_(newContiguous)(target);
+ weights = weights ? THTensor_(newContiguous)(weights) : NULL;
+
+ THIndex_t *target_data = THIndexTensor_(data)(target);
+ real *weights_data = weights ? THTensor_(data)(weights) : NULL;
+ real *gradInput_data = THTensor_(data)(gradInput);
+
+ if (THTensor_(nDimension)(input) == 1) {
+ int cur_target = target_data[0] - TH_INDEX_BASE;
+ if (cur_target != ignore_index) {
+ THAssert(cur_target >= 0 && cur_target < n_classes);
+
+ gradInput_data[cur_target] =
+ (!sizeAverage && weights) ? -weights_data[cur_target] : -1;
+ }
+
+ } else if (THTensor_(nDimension)(input) == 2) {
+ int batch_size = THTensor_(size)(input, 0);
+ THAssert(THIndexTensor_(size)(target, 0) == batch_size);
+
+ int n_target = THTensor_(size)(input, 1);
+
+ int i;
+ for (i = 0; i < batch_size; i++){
+ int cur_target = target_data[i] - TH_INDEX_BASE;
+
+ if (cur_target != ignore_index) {
+ THAssert(cur_target >= 0 && cur_target < n_classes);
+
+ gradInput_data[i * n_target + cur_target] =
+ -(weights ? weights_data[cur_target] : 1.0f);
+
+ if (sizeAverage && *total_weight_data) {
+ gradInput_data[i * n_target + cur_target] /= *total_weight_data;
+ }
+ }
+ }
+ }
+
+ THIndexTensor_(free)(target);
+ if (weights) {
+ THTensor_(free)(weights);
+ }
+}
+
+#endif
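
With sizeAverage set, the 2D case computes loss = -sum_i w[t_i] * input[i][t_i] / sum_i w[t_i] over the batch, skipping ignore_index entries. A small sketch assuming 0-based targets and illustrative log-probabilities (ignore_index handling omitted for brevity):

```c
#include <stdio.h>

int main(void) {
  double input[2][3] = {{-0.2, -1.8, -2.5}, {-1.0, -0.6, -2.0}};
  long target[2] = {0, 1};               /* 0-based class indices */
  double weights[3] = {1.0, 2.0, 1.0};   /* per-class weights */
  double total_weight = 0, output = 0;
  for (int i = 0; i < 2; i++) {
    double cw = weights[target[i]];
    total_weight += cw;
    output -= input[i][target[i]] * cw;
  }
  if (total_weight > 0) output /= total_weight;  /* sizeAverage */
  printf("loss = %.4f\n", output);               /* (0.2*1 + 0.6*2) / 3 */
  return 0;
}
```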
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/DistKLDivCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/DistKLDivCriterion.c
new file mode 100644
index 000000000..6bd6aa067
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/DistKLDivCriterion.c
@@ -0,0 +1,44 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/DistKLDivCriterion.c"
+#else
+
+void THNN_(DistKLDivCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+
+ real sum = 0;
+
+ TH_TENSOR_APPLY2(real, input, real, target,
+ sum += *target_data > 0 ? *target_data * (log(*target_data) - *input_data) : 0;
+ );
+
+ if (sizeAverage)
+ sum /= THTensor_(nElement)(input);
+
+ THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(DistKLDivCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+
+ real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
+
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+ *gradInput_data = *target_data > 0 ? norm * (-*target_data) : 0;
+ );
+}
+
+#endif
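
Each element contributes target * (log(target) - input) when target > 0, i.e. the input is expected to hold log-probabilities. A sketch with illustrative distributions:

```c
#include <stdio.h>
#include <math.h>

int main(void) {
  double input[3] = {log(0.2), log(0.5), log(0.3)};  /* log-probabilities */
  double target[3] = {0.1, 0.6, 0.3};                /* probabilities */
  double sum = 0;
  for (int i = 0; i < 3; i++)
    if (target[i] > 0)
      sum += target[i] * (log(target[i]) - input[i]);
  printf("KL = %.6f (always >= 0)\n", sum / 3);      /* sizeAverage over 3 */
  return 0;
}
```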
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/ELU.c b/contrib/lua-torch/nn/lib/THNN/generic/ELU.c
new file mode 100644
index 000000000..ddcfb9705
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/ELU.c
@@ -0,0 +1,54 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/ELU.c"
+#else
+
+void THNN_(ELU_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal alpha_,
+ bool inplace)
+{
+ real alpha = TH_CONVERT_ACCREAL_TO_REAL(alpha_);
+ if(inplace) {
+ TH_TENSOR_APPLY(real, input,
+ if(*input_data <= 0) {
+ *input_data = (exp(*input_data) - 1) * alpha;
+ }
+ );
+ THTensor_(set)(output, input);
+ } else {
+ THTensor_(resizeAs)(output, input);
+ TH_TENSOR_APPLY2(real, input, real, output,
+ *output_data = *input_data <= 0 ? (exp(*input_data)-1)*alpha : *input_data;
+ );
+ }
+}
+
+void THNN_(ELU_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output,
+ accreal alpha_,
+ bool inplace)
+{
+ real alpha = TH_CONVERT_ACCREAL_TO_REAL(alpha_);
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ if(inplace) {
+ TH_TENSOR_APPLY2(real, gradOutput, real, output,
+ if(*output_data <= 0) {
+ *gradOutput_data *= *output_data + alpha;
+ }
+ );
+ THTensor_(set)(gradInput, gradOutput);
+ } else {
+ THTensor_(resizeAs)(gradInput, output);
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
+ *gradInput_data = *output_data <= 0 ? *gradOutput_data * (*output_data + alpha) : *gradOutput_data;
+ );
+ }
+}
+
+#endif
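
The backward pass reuses the saved output rather than the input: on the negative side ELU'(x) = exp(x) = output + alpha, which is why only `output` and `alpha` appear in the gradient above. A standalone sketch with alpha = 1 and illustrative inputs:

```c
#include <stdio.h>
#include <math.h>

int main(void) {
  double alpha = 1.0;
  double xs[3] = {-2.0, 0.0, 1.5};
  for (int i = 0; i < 3; i++) {
    double x = xs[i];
    double y = x <= 0 ? (exp(x) - 1) * alpha : x;  /* forward */
    double dy = y <= 0 ? y + alpha : 1.0;          /* == exp(x) for x <= 0 */
    printf("x=% .1f  y=% .4f  dy=% .4f\n", x, y, dy);
  }
  return 0;
}
```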
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/FusedRNNKernel.c b/contrib/lua-torch/nn/lib/THNN/generic/FusedRNNKernel.c
new file mode 100644
index 000000000..30788b0a2
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/FusedRNNKernel.c
@@ -0,0 +1,55 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/FusedRNNKernel.c"
+#else
+
+void THNN_(GRUFused_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *hidden,
+ THTensor *bias1,
+ THTensor *bias2,
+ THTensor *hx,
+ THTensor *hy,
+ THTensor *storage)
+{
+ THAssertMsg(false, "Not implemented for CPU");
+}
+
+void THNN_(GRUFused_updateGradInput)(
+ THNNState *state,
+ THTensor *gradInInput,
+ THTensor *gradInHidden,
+ THTensor *gradOutput,
+ THTensor *gradInputHx,
+ THTensor *storage)
+{
+ THAssertMsg(false, "Not implemented for CPU");
+}
+
+void THNN_(LSTMFused_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *hidden,
+ THTensor *bias1,
+ THTensor *bias2,
+ THTensor *cx,
+ THTensor *hy,
+ THTensor *cy)
+{
+ THAssertMsg(false, "Not implemented for CPU");
+}
+
+void THNN_(LSTMFused_updateGradInput)(
+ THNNState *state,
+ THTensor *storage,
+ THTensor *gradInGates,
+ THTensor *prevC,
+ THTensor *cy,
+ THTensor *gradOutput,
+ THTensor *gradOutputCell,
+ THTensor *gradInputCx)
+{
+ THAssertMsg(false, "Not implemented for CPU");
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/GatedLinearUnit.c b/contrib/lua-torch/nn/lib/THNN/generic/GatedLinearUnit.c
new file mode 100644
index 000000000..274a27e3b
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/GatedLinearUnit.c
@@ -0,0 +1,73 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/GatedLinearUnit.c"
+#else
+
+void THNN_(GatedLinear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int dim)
+{
+ // size output to half of input
+ dim = dim - TH_INDEX_BASE;
+ const long nIn = THTensor_(size)(input, dim);
+ THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. Dim %d is size %ld",
+ dim + TH_INDEX_BASE, nIn);
+
+ const long inputSize = THTensor_(size)(input, dim) / 2;
+ THLongStorage *newSizes = THTensor_(newSizeOf)(input);
+ THLongStorage_set(newSizes, dim, inputSize);
+ THTensor_(resize)(output, newSizes, NULL);
+
+ // halve tensor
+ THTensor *firstHalf = THTensor_(newNarrow)(input, dim, 0, inputSize);
+ THTensor *secondHalf = THTensor_(newNarrow)(input, dim, inputSize, inputSize);
+
+ // x = x1:cmul( sigmoid(x2) )
+ THTensor_(sigmoid)(output, secondHalf);
+ THTensor_(cmul)(output, output, firstHalf);
+
+ THLongStorage_free(newSizes);
+ THTensor_(free)(firstHalf);
+ THTensor_(free)(secondHalf);
+}
+
+void THNN_(GatedLinear_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int dim)
+{
+ // set up tensors
+ dim = dim - TH_INDEX_BASE;
+ const long nIn = THTensor_(size)(input, dim);
+ THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. Dim %d is size %ld",
+ dim + TH_INDEX_BASE, nIn);
+
+ THTensor_(resizeAs)(gradInput, input);
+ const long inputSize = THTensor_(size)(input, dim) / 2;
+ THTensor *firstHalf = THTensor_(newNarrow)(input, dim, 0, inputSize);
+ THTensor *secondHalf = THTensor_(newNarrow)(input, dim, inputSize, inputSize);
+ THTensor *gradInputfirstHalf = THTensor_(newNarrow)(gradInput, dim, 0, inputSize);
+ THTensor *gradInputsecondHalf = THTensor_(newNarrow)(gradInput, dim, inputSize, inputSize);
+
+ THTensor_(sigmoid)(gradInputfirstHalf, secondHalf);
+
+ TH_TENSOR_APPLY2(real, gradInputsecondHalf, real, gradInputfirstHalf,
+ real z = *gradInputfirstHalf_data;
+ *gradInputsecondHalf_data = (1. - z) * z;
+ );
+
+ THTensor_(cmul)(gradInputfirstHalf, gradInputfirstHalf, gradOutput);
+
+ THTensor_(cmul)(gradInputsecondHalf, gradInputsecondHalf, gradOutput);
+ THTensor_(cmul)(gradInputsecondHalf, gradInputsecondHalf, firstHalf);
+
+ THTensor_(free)(firstHalf);
+ THTensor_(free)(secondHalf);
+ THTensor_(free)(gradInputfirstHalf);
+ THTensor_(free)(gradInputsecondHalf);
+}
+
+#endif
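
The module splits the chosen dimension in half and computes x1 * sigmoid(x2), which is what the narrow/sigmoid/cmul sequence above does. A sketch over a flat vector with illustrative values:

```c
#include <stdio.h>
#include <math.h>

static double sigmoid(double x) { return 1.0 / (1.0 + exp(-x)); }

int main(void) {
  double in[4] = {1.0, -2.0, 0.5, 3.0};  /* x1 = {1, -2}, x2 = {0.5, 3} */
  int half = 2;
  for (int k = 0; k < half; k++)
    printf("out[%d] = %.4f\n", k, in[k] * sigmoid(in[half + k]));
  return 0;
}
```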
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/HardShrink.c b/contrib/lua-torch/nn/lib/THNN/generic/HardShrink.c
new file mode 100644
index 000000000..aaae85bac
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/HardShrink.c
@@ -0,0 +1,42 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/HardShrink.c"
+#else
+
+void THNN_(HardShrink_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal lambda_)
+{
+ real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_);
+ THTensor_(resizeAs)(output, input);
+
+ TH_TENSOR_APPLY2(real, output, real, input,
+ if (*input_data > lambda)
+ *output_data = *input_data;
+ else if (*input_data < -lambda)
+ *output_data = *input_data;
+ else
+ *output_data = 0;
+ );
+}
+
+void THNN_(HardShrink_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ accreal lambda_)
+{
+ real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_);
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ if (*input_data > lambda || *input_data < -lambda)
+ *gradInput_data = *gradOutput_data;
+ else
+ *gradInput_data = 0;
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/HardTanh.c b/contrib/lua-torch/nn/lib/THNN/generic/HardTanh.c
new file mode 100644
index 000000000..589a66e15
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/HardTanh.c
@@ -0,0 +1,133 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/HardTanh.c"
+#else
+
+void THNN_(HardTanh_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal min_val_,
+ accreal max_val_,
+ bool inplace)
+{
+ real min_val = TH_CONVERT_ACCREAL_TO_REAL(min_val_);
+ real max_val = TH_CONVERT_ACCREAL_TO_REAL(max_val_);
+ if (inplace)
+ THTensor_(set)(output, input);
+ else
+ THTensor_(resizeAs)(output, input);
+
+ if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output))
+ {
+ if (inplace)
+ TH_TENSOR_APPLY(real, input,
+ if (*input_data < min_val)
+ *input_data = min_val;
+ else if (*input_data > max_val)
+ *input_data = max_val;
+ );
+    else
+      TH_TENSOR_APPLY2(real, output, real, input,
+        if (*input_data < min_val)
+          *output_data = min_val;
+        else if (*input_data <= max_val)
+          *output_data = *input_data;
+        else
+          *output_data = max_val;
+      );
+ }
+ else
+ {
+ real* ptr_input = THTensor_(data)(input);
+ real* ptr_output = THTensor_(data)(output);
+ ptrdiff_t i;
+ ptrdiff_t n = THTensor_(nElement)(input);
+
+ if (inplace)
+#pragma omp parallel for private(i)
+ for (i = 0; i < n; i++)
+ {
+ if (ptr_input[i] < min_val)
+ ptr_input[i] = min_val;
+ else if (ptr_input[i] > max_val)
+ ptr_input[i] = max_val;
+ }
+ else
+#pragma omp parallel for private(i)
+ for (i = 0; i < n; i++)
+ {
+ if (ptr_input[i] < min_val)
+ ptr_output[i] = min_val;
+ else if (ptr_input[i] <= max_val)
+ ptr_output[i] = ptr_input[i];
+ else
+ ptr_output[i] = max_val;
+ }
+ }
+}
+
+void THNN_(HardTanh_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ accreal min_val_,
+ accreal max_val_,
+ bool inplace)
+{
+ real min_val = TH_CONVERT_ACCREAL_TO_REAL(min_val_);
+ real max_val = TH_CONVERT_ACCREAL_TO_REAL(max_val_);
+
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ if (inplace)
+ THTensor_(set)(gradInput, gradOutput);
+ else
+ THTensor_(resizeAs)(gradInput, input);
+
+ if (input->nDimension == 1 ||
+ !THTensor_(isContiguous)(input) ||
+ !THTensor_(isContiguous)(gradOutput) ||
+ !THTensor_(isContiguous)(gradInput))
+ {
+ if (inplace)
+ {
+ TH_TENSOR_APPLY2(real, gradOutput, real, input,
+ if (*input_data <= min_val || *input_data >= max_val)
+ *gradOutput_data = 0;
+ );
+ }
+ else
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ if (*input_data <= min_val || *input_data >= max_val)
+ *gradInput_data = 0;
+ else
+ *gradInput_data = *gradOutput_data;
+ );
+ }
+ else
+ {
+ real* ptr_gradOutput = THTensor_(data)(gradOutput);
+ real* ptr_gradInput = THTensor_(data)(gradInput);
+ real* ptr_input = THTensor_(data)(input);
+ ptrdiff_t i;
+ ptrdiff_t n = THTensor_(nElement)(input);
+
+ if (inplace)
+#pragma omp parallel for private(i)
+ for (i = 0; i < n; i++)
+ {
+ if (ptr_input[i] <= min_val || ptr_input[i] >= max_val)
+ ptr_gradInput[i] = 0;
+ }
+ else
+#pragma omp parallel for private(i)
+ for (i = 0; i < n; i++)
+ {
+ if (ptr_input[i] <= min_val || ptr_input[i] >= max_val)
+ ptr_gradInput[i] = 0;
+ else
+ ptr_gradInput[i] = ptr_gradOutput[i];
+ }
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/IndexLinear.c b/contrib/lua-torch/nn/lib/THNN/generic/IndexLinear.c
new file mode 100644
index 000000000..42d8368ba
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/IndexLinear.c
@@ -0,0 +1,742 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/IndexLinear.c"
+#else
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+/* Threshold used to trigger multithreading */
+#ifndef THNN_SPARSE_OMP_THRESHOLD
+#define THNN_SPARSE_OMP_THRESHOLD 100000
+#endif
+
+/* Threshold used to trigger BLAS axpy call */
+#ifndef THNN_SPARSE_OUTDIM_THRESHOLD
+#define THNN_SPARSE_OUTDIM_THRESHOLD 49
+#endif
+
+/* sign MACRO */
+#ifndef THNN_INDEXLINEAR_SIGN
+#define THNN_INDEXLINEAR_SIGN(a) ( ( (a) < 0 ) ? -1 : ( (a) > 0 ) )
+#endif
+
+static bool THNN_(checkKeysValues)(THLongTensor* keys, THTensor* values)
+{
+ return THLongTensor_size(keys, 0) == THTensor_(nElement)(values)
+ && THTensor_(nDimension)(values) == 1
+ && THLongTensor_nDimension(keys) == 1;
+}
+
+void THNN_(IndexLinear_updateOutput)(
+ THNNState *state,
+ THLongTensor *keys,
+ long keysOffset,
+ THTensor *values,
+ THLongTensor *sizes,
+ THLongTensor *cumSumSizes,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *normalizedValues,
+ int train)
+{
+ /* Retrieve all the dimensions of the problem */
+ long batchSize = THLongTensor_size(sizes, 0);
+ long keysSize = THLongTensor_size(keys, 0);
+ long outDim = THTensor_(size)(bias, 0);
+ long woutDim = THTensor_(size)(weight, 1);
+ int maxNormalize = woutDim - outDim;
+ long* sizesData = THLongTensor_data(sizes);
+ long* cumSumSizesData = THLongTensor_data(cumSumSizes);
+
+ /* Define/resize the normalized values tensor if maxNormalize is > 0 */
+ real* normalizedValuesData = NULL;
+ if (maxNormalize)
+ {
+ THTensor_(resize1d)(normalizedValues, keysSize);
+ normalizedValuesData = THTensor_(data)(normalizedValues);
+ }
+
+ /* Resize the output */
+ THTensor_(resize2d)(output, batchSize, outDim);
+
+ /* Access the storage data/strides */
+ real* outputData = THTensor_(data)(output);
+ real* valuesData = THTensor_(data)(values);
+ real* weightData = THTensor_(data)(weight);
+ long weightStride0 = weight->stride[0];
+ real* biasData = THTensor_(data)(bias);
+ long* keysData = THLongTensor_data(keys);
+
+ /* Make sure these inputs are contiguous to accelerate computations */
+ THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(output), 6, "output vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(bias), 8, "bias vector must be contiguous");
+ THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");
+ THArgCheck(THTensor_(isContiguous)(normalizedValues), 9, "normalizedValues vector must be contiguous");
+ long i,j,k;
+
+ /* Separate cases: output dimension is == 1, or > 1
+ * This allows for some optimizations. */
+ if (outDim == 1)
+ {
+ THVector_(fill)(outputData, *biasData, batchSize);
+ if (maxNormalize)
+ {
+ /* Parallelize on the batch itself */
+#pragma omp parallel \
+ for private(i,j) \
+ firstprivate(outDim, keysOffset, \
+ weightData, keysData, \
+ valuesData, outputData, \
+ cumSumSizesData, sizesData) \
+ schedule(static) \
+ if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1)
+ for (j = 0; j < batchSize; j++)
+ {
+ real* loutputData = outputData + j;
+ real val = 0;
+ real absVal = 0;
+ long offset = j == 0 ? 0 : cumSumSizesData[j - 1];
+
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ long woffset = weightStride0*(keysData[offset] + keysOffset);
+ absVal = fabs(valuesData[offset]);
+ if (train)
+ {
+ if (absVal > weightData[woffset])
+ {
+ weightData[woffset] = absVal;
+ weightData[woffset+1] = 1/absVal;
+ }
+
+ /*
+ * The following can be used to scale the size of the updates
+ * depending on some rule, e.g. the frequency of a feature, ...
+ * This is used at update time.
+ * TODO: implement a smarter update scale.
+ */
+ weightData[woffset+2] = 1;
+ }
+ normalizedValuesData[offset] = (absVal > weightData[woffset] ? THNN_INDEXLINEAR_SIGN(valuesData[offset]):valuesData[offset]*weightData[woffset+1]) + weightData[woffset+3];
+ val += normalizedValuesData[offset] * weightData[woffset+maxNormalize];
+ offset++;
+ }
+ *loutputData += val;
+ }
+ }
+ else
+ {
+ /* Parallelize on the batch itself */
+#pragma omp parallel \
+ for private(i,j) \
+ firstprivate(outDim, weightData, \
+ keysData, valuesData, \
+ outputData, cumSumSizesData, \
+ sizesData) \
+ schedule(static) \
+ if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1)
+ for (j = 0; j < batchSize; j++)
+ {
+ long offset = j == 0 ? 0 : cumSumSizesData[j - 1];
+ real* loutputData = outputData + j;
+ real val = 0;
+
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ val += weightData[weightStride0*(keysData[offset] + keysOffset)] * valuesData[offset];
+ offset++;
+ }
+ *loutputData += val;
+ }
+ }
+ }
+ else {
+#pragma omp parallel \
+ for private(i,j,k) \
+ firstprivate(outDim, weightData, \
+ keysData, valuesData, \
+ biasData, outputData, \
+ cumSumSizesData, sizesData) \
+ schedule(static) \
+ if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1)
+ for (j = 0; j < batchSize; j++)
+ {
+ long offset = j == 0 ? 0 : cumSumSizesData[j - 1];
+ real val = 0;
+ real* loutputData = outputData + j*outDim;
+ real* lweightData = weightData;
+ memcpy(loutputData, biasData, outDim*sizeof(real));
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ real val;
+ long woffset = weightStride0*(keysData[offset] + keysOffset);
+ if (maxNormalize)
+ {
+ val = valuesData[offset];
+ real absVal = fabs(val);
+ if (train)
+ {
+ if (absVal > weightData[woffset])
+ {
+ weightData[woffset] = absVal;
+ weightData[woffset+1] = 1/absVal;
+ }
+
+ /*
+ * The following can be used to scale the size of the updates
+ * depending on some rule, e.g. the frequency of a feature, ...
+ * The commented section thereafter is just an example of what can be done:
+ *
+ *```
+ * weightData[woffset+2] = weightData[woffset+2]==0?1:(weightData[woffset+2] / (weightData[woffset+2] + 1));
+ * real alpha = 1;
+ * real beta = 0.01;
+ * real gamma = 1 - 0.000001;
+ * real l = weightData[woffset+2]==0?1/gamma:(weightData[woffset+2] - beta) / (alpha - beta);
+ * l = gamma*l;
+ * weightData[woffset+2] = (alpha-beta)*l + beta;
+ * ```
+ *
+ * TODO: implement a smarter update scale.
+ */
+ weightData[woffset+2] = 1;
+ }
+
+ /* Normalize + Clamp */
+ val = (absVal > weightData[woffset] ? THNN_INDEXLINEAR_SIGN(val):val*weightData[woffset+1]) + weightData[woffset+3];
+ normalizedValuesData[offset] = val;
+
+ lweightData = weightData + woffset + maxNormalize;
+ }
+ else
+ {
+ val = valuesData[offset];
+ lweightData = weightData + woffset;
+ }
+ if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
+ {
+ THBlas_(axpy)(outDim, val, lweightData, 1, loutputData, 1);
+ }
+ else
+ {
+ for (k=0; k < outDim; k++)
+ {
+ loutputData[k] += lweightData[k] * val;
+ }
+ }
+ offset++;
+ }
+ }
+ }
+ return;
+}
+
+void THNN_(IndexLinear_updateParameters)(
+ THNNState *state,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *weight,
+ THTensor *bias,
+ THLongTensor *runningKeys,
+ THLongTensor *cumSumSizes,
+ long keysOffset,
+ accreal weightDecay_,
+ accreal learningRate_)
+{
+ real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
+ real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
+ /* Retrieve all the dimensions of the problem */
+ long outDim = THTensor_(size)(bias, 0);
+ long woutDim = THTensor_(size)(weight, 1);
+ int maxNormalize = woutDim - outDim;
+ long keysSize = THLongTensor_size(runningKeys, 0);
+
+ /* Access the storage data/strides */
+ real* gradWeightData = THTensor_(data)(gradWeight);
+ real* weightData = THTensor_(data)(weight);
+ long weightStride0 = weight->stride[0];
+ real* gradBiasData = THTensor_(data)(gradBias);
+ real* biasData = THTensor_(data)(bias);
+ long* keysData = THLongTensor_data(runningKeys);
+
+ /* Make sure these inputs are contiguous to accelerate computations */
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 1, "gradWeight must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(gradBias), 2, "gradBias vector must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(weight), 3, "weight matrix must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(bias), 4, "bias vector must be contiguous");
+ THArgCheck(THLongTensor_isContiguous(runningKeys), 5, "keys vector must be contiguous");
+
+ int j,k;
+ long offset = 0;
+
+ /* Update the bias first */
+ THVector_(cadd)(biasData, biasData, gradBiasData, -learningRate, outDim);
+
+ /* Separate cases: output dimension is == 1, or > 1
+ * This allows for some optimizations.
+ * No multithreading here as this could
+ * corrupt the results (hogwild style) */
+ if (outDim == 1)
+ {
+ if (maxNormalize)
+ {
+ if (weightDecay)
+ {
+ for (j = 0; j < keysSize; j++)
+ {
+ long woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize;
+ real lr = learningRate*weightData[woffset-2];
+ weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr;
+ weightData[woffset] -= gradWeightData[2*j+1]*lr - weightDecay * weightData[woffset-2] * weightData[woffset];
+ }
+ }
+ else
+ {
+ for (j = 0; j < keysSize; j++)
+ {
+ long woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize;
+ real lr = learningRate*weightData[woffset-2];
+ weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr;
+ weightData[woffset] -= gradWeightData[2*j+1]*lr;
+ }
+ }
+ }
+ else
+ {
+ if (weightDecay)
+ {
+ for (j = 0; j < keysSize; j++)
+ {
+ long woffset = weightStride0*(keysData[j] + keysOffset);
+ weightData[woffset] -= gradWeightData[j]*learningRate + weightDecay * weightData[woffset];
+ }
+ }
+ else
+ {
+ for (j = 0; j < keysSize; j++)
+ {
+ weightData[weightStride0*(keysData[j] + keysOffset)] -= gradWeightData[j]*learningRate;
+ }
+ }
+ }
+ }
+ else
+ {
+ for (j = 0; j < keysSize; j++)
+ {
+ real lr = learningRate;
+ real wd = weightDecay;
+ real* lweightData;
+ long woffset = weightStride0*(keysData[j] + keysOffset);
+ real* lgradWeightData = gradWeightData + j*outDim;
+ if (maxNormalize)
+ {
+ lgradWeightData += j*outDim;
+ /* weightData[woffset + 2] */
+ lweightData = weightData + woffset + maxNormalize - 2;
+ lr = lr*lweightData[0];
+ wd = weightDecay*lweightData[0];
+ /* weightData[woffset + 3] */
+ lweightData++;
+ for (k=0; k < outDim; k++)
+ {
+ lweightData[0] -= lgradWeightData[k]*lweightData[k+1]*lr;
+ }
+ lweightData++;
+ lgradWeightData += outDim;
+ }
+ else
+ {
+ lweightData = weightData + woffset;
+ }
+
+ /* We do sparse weight decay.
+ * We think it makes more sense. */
+ if (weightDecay)
+ {
+ for (k=0; k < outDim; k++)
+ {
+ lweightData[k] -= lweightData[k]*wd;
+ }
+ }
+
+ if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
+ {
+ THBlas_(axpy)(outDim, -lr, lgradWeightData, 1, lweightData, 1);
+ }
+ else
+ {
+ for (k=0; k < outDim; k++)
+ {
+ lweightData[k] -= lgradWeightData[k]*lr;
+ }
+ }
+ }
+ }
+}
+
+
+void THNN_(IndexLinear_accUpdateGradParameters)(
+ THNNState *state,
+ THLongTensor *keys,
+ long keysOffset,
+ THTensor *values,
+ THLongTensor *sizes,
+ THLongTensor *cumSumSizes,
+ THTensor *gradOutput,
+ THTensor *weight,
+ THTensor *bias,
+ accreal weightDecay_,
+ accreal scale_)
+{
+ real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ /* Retrieve all the dimensions of the problem */
+ long batchSize = THLongTensor_size(sizes, 0);
+ long keysSize = THLongTensor_size(keys, 0);
+ long outDim = THTensor_(size)(bias, 0);
+ long woutDim = THTensor_(size)(weight, 1);
+ int maxNormalize = woutDim - outDim;
+ THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");
+
+ /* Access the storage data/strides */
+ real* gradOutputData = THTensor_(data)(gradOutput);
+  real* valuesData = THTensor_(data)(values);
+ real* weightData = THTensor_(data)(weight);
+ real* biasData = THTensor_(data)(bias);
+ long weightStride0 = weight->stride[0];
+ long biasStride = bias->stride[0];
+ long* keysData = THLongTensor_data(keys);
+ long* sizesData = THLongTensor_data(sizes);
+
+ /* Make sure these inputs are contiguous to accelerate computations */
+ THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(bias), 8, "bias vector must be contiguous");
+
+ int i,j,k;
+
+ /* Separate cases: output dimension is == 1, or > 1
+ * This allows for some optimizations.
+ * No multithreading here as this could
+ * corrupt the results (hogwild style) */
+ if (outDim == 1)
+ {
+ if (maxNormalize)
+ {
+ long offset = 0;
+ for (j = 0; j < batchSize; j++)
+ {
+ real* lgradOutputData = gradOutputData + j;
+ *biasData -= *lgradOutputData * scale;
+ real val = *lgradOutputData * scale;
+ real* lweightData = weightData;
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ long idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize;
+ weightData[idx-1] -= weightData[idx]*val*weightData[idx-2];
+ weightData[idx] -= (val*valuesData[offset] - weightDecay * weightData[idx])*weightData[idx-2];
+ offset++;
+ }
+ }
+
+ offset = 0;
+ for (j = 0; j < batchSize; j++)
+ {
+ real* lweightData = weightData;
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ long idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize;
+ weightData[idx-2] = 0;
+ offset++;
+ }
+ }
+ }
+ else
+ {
+ if (weightDecay)
+ {
+ long offset = 0;
+ for (j = 0; j < batchSize; j++)
+ {
+ real* lgradOutputData = gradOutputData + j;
+ *biasData -= *lgradOutputData * scale;
+ real val = *lgradOutputData * scale;
+ real* lweightData = weightData;
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ long idx = weightStride0*(keysData[offset] + keysOffset);
+ weightData[idx] -= val * valuesData[offset] + weightData[idx] * weightDecay;
+ offset++;
+ }
+ }
+ }
+ else
+ {
+ long offset = 0;
+ for (j = 0; j < batchSize; j++)
+ {
+ real val = gradOutputData[j] * scale;
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ weightData[(keysData[offset] + keysOffset)*weightStride0] -= val * valuesData[offset];
+ offset++;
+ }
+ *biasData -= val;
+ }
+ }
+ }
+ }
+ else {
+ long offset = 0;
+ for (j = 0; j < batchSize; j++)
+ {
+ real val = 0;
+ real* lgradOutputData = gradOutputData + j*outDim;
+ real* lweightData = weightData;
+ THVector_(cadd)(biasData, biasData, lgradOutputData, -scale, outDim);
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ real val = valuesData[offset] * scale;
+ real wd = weightDecay;
+
+ // Max normalize case
+ if (maxNormalize)
+ {
+ lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2);
+ val *= lweightData[0];
+ wd *= lweightData[0];
+ for (k=0; k < outDim; k++)
+ {
+ lweightData[1] -= lweightData[k+2]*scale*lgradOutputData[k]*lweightData[0];
+ }
+ lweightData += 2;
+ }
+ else
+ {
+ lweightData = weightData + weightStride0*(keysData[offset] + keysOffset);
+ }
+
+ /* We do sparse weight decay.
+ * We think it makes more sense. */
+ if (weightDecay)
+ {
+ if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
+ {
+ THBlas_(axpy)(outDim, -wd, lweightData, 1, lweightData, 1);
+ }
+ else
+ {
+ for (k=0; k < outDim; k++)
+ {
+ lweightData[k] -= wd * lweightData[k];
+ }
+ }
+ }
+
+ if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
+ {
+ THBlas_(axpy)(outDim, -val, lgradOutputData, 1, lweightData, 1);
+ }
+ else
+ {
+ for (k=0; k < outDim; k++)
+ {
+ lweightData[k] -= val * lgradOutputData[k];
+ }
+ }
+ offset++;
+ }
+ }
+
+ /* Max Normalize case:
+ * Reset the smart update scaling if
+ * one does it batch-wise.
+ * TODO: Decide what to do with that piece of code.
+     * NB: If the code below is uncommented, the commented code in
+     * IndexLinear:zeroGradParameters() should be uncommented as well. */
+
+ /*
+ if (maxNormalize)
+ {
+ offset = 0;
+ for (j = 0; j < batchSize; j++)
+ {
+ real* lweightData = weightData;
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ real val = valuesData[offset] * scale;
+ real wd = weightDecay;
+
+ lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2);
+ lweightData[0] = 0;
+ offset++;
+ }
+ }
+ }
+ */
+ }
+ return;
+}
+
+void THNN_(IndexLinear_accGradParameters)(
+ THNNState *state,
+ THLongTensor *keys,
+ long keysOffset,
+ THTensor *values,
+ THLongTensor *sizes,
+ THLongTensor *cumSumSizes,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *valuesBuffer,
+ accreal weightDecay_,
+ accreal scale_)
+{
+ real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ /* Retrieve all the dimensions of the problem */
+ long batchSize = THLongTensor_size(sizes, 0);
+ long keysSize = THLongTensor_size(keys, 0);
+ long outDim = THTensor_(size)(bias, 0);
+ long woutDim = THTensor_(size)(weight, 1);
+  long maxNormalize = (woutDim - outDim) > 0 ? 1 : 0;
+ THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");
+ long* sizesData = THLongTensor_data(sizes);
+
+  /* Compute the cumulative sizes */
+ THLongTensor* cumSizes = THLongTensor_new();
+ THLongTensor_cumsum(cumSizes, sizes, 0);
+ long* cumSizesData = THLongTensor_data(cumSizes);
+
+ /* Resize the gradWeight buffer to keep it dense.
+ * That speeds up updates A LOT assuming random mem access. */
+ THTensor_(resize2d)(gradWeight, keysSize, outDim * (maxNormalize>0?2:1));
+
+ /* Access the storage data/strides */
+ real* gradOutputData = THTensor_(data)(gradOutput);
+  real* valuesData = THTensor_(data)(values);
+ real* gradWeightData = THTensor_(data)(gradWeight);
+ real* weightData = THTensor_(data)(weight);
+ real* gradBiasData = THTensor_(data)(gradBias);
+ long gradWeightStride0 = gradWeight->stride[0];
+ long weightStride0 = weight->stride[0];
+ long* keysData = THLongTensor_data(keys);
+
+ /* Make sure these inputs are contiguous to accelerate computations */
+ THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 7, "gradWeight must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(gradBias), 8, "gradBias vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(weight), 9, "weight must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(bias), 10, "bias vector must be contiguous");
+ THArgCheck(THTensor_(isContiguous)(valuesBuffer), 11, "valuesBuffer must be contiguous");
+
+ int i,j,k;
+
+ /* Separate cases: output dimension is == 1, or > 1
+ * This allows for some optimizations.
+ * No multithreading here as this could
+ * corrupt the results (hogwild style) */
+ if (outDim == 1)
+ {
+ for (j = 0; j < batchSize; j++)
+ {
+ long offset = j==0?0:cumSizesData[j-1];
+ real val = gradOutputData[j] * scale;
+ real* lgradWeightData = gradWeightData + offset;
+ real* lvaluesData = valuesData + offset;
+ long end = sizesData[j];
+
+ if (maxNormalize)
+ {
+ lgradWeightData += offset;
+ i = 0;
+ for(;i < end; i++)
+ {
+ lgradWeightData[2*i] = val;
+ lgradWeightData[2*i+1] = val * lvaluesData[i];
+ }
+ }
+ else
+ {
+ i = 0;
+ for(;i < end-4; i += 4)
+ {
+ lgradWeightData[i] = val * lvaluesData[i];
+ lgradWeightData[i+1] = val * lvaluesData[i+1];
+ lgradWeightData[i+2] = val * lvaluesData[i+2];
+ lgradWeightData[i+3] = val * lvaluesData[i+3];
+ }
+
+ for(; i < end; i++)
+ {
+ lgradWeightData[i] = val * lvaluesData[i];
+ }
+ }
+ *gradBiasData += val;
+ offset += end;
+ }
+ }
+ else {
+ for (j = 0; j < batchSize; j++)
+ {
+ long offset = j==0?0:cumSizesData[j-1];
+ real val = 0;
+ real* lgradOutputData = gradOutputData + j*outDim;
+ real* lgradWeightData = gradWeightData;
+ real* lweightData = weightData;
+ THVector_(cadd)(gradBiasData, gradBiasData, lgradOutputData, scale, outDim);
+ for (i = 0; i < sizesData[j]; i++)
+ {
+ real val = valuesData[offset] * scale;
+ lgradWeightData = gradWeightData + offset*outDim;
+ if (maxNormalize)
+ {
+ lgradWeightData += offset*outDim;
+ k = 0;
+ for(;k < outDim-4; k += 4)
+ {
+ lgradWeightData[k] = lgradOutputData[k]*scale;
+ lgradWeightData[k+1] = lgradOutputData[k+1]*scale;
+ lgradWeightData[k+2] = lgradOutputData[k+2]*scale;
+ lgradWeightData[k+3] = lgradOutputData[k+3]*scale;
+ }
+
+ for(; k < outDim; k++)
+ {
+ lgradWeightData[k] = lgradOutputData[k]*scale;
+ }
+ lgradWeightData += outDim;
+ }
+ k = 0;
+ for(;k < outDim-4; k += 4)
+ {
+ lgradWeightData[k] = val * lgradOutputData[k];
+ lgradWeightData[k+1] = val * lgradOutputData[k+1];
+ lgradWeightData[k+2] = val * lgradOutputData[k+2];
+ lgradWeightData[k+3] = val * lgradOutputData[k+3];
+ }
+
+ for(; k < outDim; k++)
+ {
+ lgradWeightData[k] = val * lgradOutputData[k];
+ }
+ offset++;
+ }
+ }
+ }
+ THLongTensor_free(cumSizes);
+ return;
+}
+#endif
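
The sparse batch layout is easiest to see with concrete numbers: keys/values hold all (feature, value) pairs of the batch concatenated, sizes[j] is the entry count of sample j, and cumSumSizes gives each sample's starting offset. A sketch of the outDim == 1 forward path (no max-normalization, made-up data):

```c
#include <stdio.h>

int main(void) {
  long keys[5] = {0, 3, 1, 2, 3};              /* feature indices, concatenated */
  double values[5] = {1.0, 2.0, 0.5, 1.0, -1.0};
  long sizes[2] = {2, 3}, cumSumSizes[2] = {2, 5};
  double weight[4] = {0.1, 0.2, 0.3, 0.4};     /* one weight per feature */
  double bias = 0.5;
  for (int j = 0; j < 2; j++) {                /* batchSize == 2 */
    long offset = j == 0 ? 0 : cumSumSizes[j - 1];
    double val = 0;
    for (long i = 0; i < sizes[j]; i++, offset++)
      val += weight[keys[offset]] * values[offset];
    printf("output[%d] = %.2f\n", j, bias + val);  /* 1.40 and 0.50 */
  }
  return 0;
}
```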
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/L1Cost.c b/contrib/lua-torch/nn/lib/THNN/generic/L1Cost.c
new file mode 100644
index 000000000..53940e894
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/L1Cost.c
@@ -0,0 +1,38 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/L1Cost.c"
+#else
+
+void THNN_(L1Cost_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output)
+{
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+ accreal sum = 0;
+
+ TH_TENSOR_APPLY(real, input,
+ sum += fabs(*input_data);
+ );
+
+ THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(L1Cost_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput)
+{
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY2(real, gradInput, real, input,
+ if (*input_data > 0)
+ *gradInput_data = 1;
+ else if (*input_data < 0)
+ *gradInput_data = -1;
+ else
+ *gradInput_data = 0;
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/LeakyReLU.c b/contrib/lua-torch/nn/lib/THNN/generic/LeakyReLU.c
new file mode 100644
index 000000000..074047d83
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/LeakyReLU.c
@@ -0,0 +1,57 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/LeakyReLU.c"
+#else
+
+void THNN_(LeakyReLU_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal negval_,
+ bool inplace)
+{
+ real negval = TH_CONVERT_ACCREAL_TO_REAL(negval_);
+ if (inplace)
+ {
+ TH_TENSOR_APPLY(real, input,
+ if (*input_data <= 0)
+ *input_data *= negval;
+ );
+ THTensor_(set)(output, input);
+ }
+ else
+ {
+ THTensor_(resizeAs)(output, input);
+ TH_TENSOR_APPLY2(real, output, real, input,
+ *output_data = *input_data > 0 ? *input_data : *input_data * negval;
+ );
+ }
+}
+
+void THNN_(LeakyReLU_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ accreal negval_,
+ bool inplace)
+{
+ real negval = TH_CONVERT_ACCREAL_TO_REAL(negval_);
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ if (inplace)
+ {
+ TH_TENSOR_APPLY2(real, gradOutput, real, input,
+ if (*input_data <= 0)
+ *gradOutput_data *= negval;
+ );
+ THTensor_(set)(gradInput, gradOutput);
+ }
+ else
+ {
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ *gradInput_data = *input_data > 0 ? *gradOutput_data : *gradOutput_data * negval;
+ );
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Linear.c b/contrib/lua-torch/nn/lib/THNN/generic/Linear.c
new file mode 100644
index 000000000..8c5cd115e
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/Linear.c
@@ -0,0 +1,114 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Linear.c"
+#else
+
+void THNN_(Linear_updateAddBuffer)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *addBuffer)
+{
+ long nframe = THTensor_(size)(input,0);
+ long nElement = THTensor_(nElement)(addBuffer);
+ if (nElement != nframe) {
+ THTensor_(resize1d)(addBuffer,nframe);
+ THTensor_(fill)(addBuffer,1.0);
+ }
+}
+
+void THNN_(Linear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *addBuffer)
+{
+ long dim = THTensor_(nDimension)(input);
+ if (dim == 1) {
+ THTensor_(resize1d)(output,THTensor_(size)(weight,0));
+ if (bias) {
+ THTensor_(copy)(output,bias);
+ }
+ else {
+ THTensor_(zero)(output);
+ }
+ THTensor_(addmv)(output,1,output,1,weight,input);
+ }
+ else if (dim == 2) {
+ long nframe = THTensor_(size)(input,0);
+ long nElement = THTensor_(nElement)(output);
+ THTensor_(resize2d)(output,nframe,THTensor_(size)(weight,0));
+ if (THTensor_(nElement)(output) != nElement) {
+ THTensor_(zero)(output);
+ }
+ THNN_(Linear_updateAddBuffer)(state,input,addBuffer);
+ THTensor *tweight = THTensor_(new)();
+ THTensor_(transpose)(tweight,weight,0,1);
+ THTensor_(addmm)(output,0,output,1,input,tweight);
+ THTensor_(free)(tweight);
+ if (bias) {
+ THTensor_(addr)(output,1,output,1,addBuffer,bias);
+ }
+ }
+}
+
+void THNN_(Linear_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight)
+{
+ if (gradInput) {
+ long nElement = THTensor_(nElement)(gradInput);
+ THTensor_(resizeAs)(gradInput,input);
+ if (THTensor_(nElement)(gradInput) != nElement) {
+ THTensor_(zero)(gradInput);
+ }
+
+ long dim = THTensor_(nDimension)(input);
+ if (dim == 1) {
+ THTensor *tweight = THTensor_(new)();
+ THTensor_(transpose)(tweight,weight,0,1);
+ THTensor_(addmv)(gradInput,0,gradInput,1,tweight,gradOutput);
+ THTensor_(free)(tweight);
+ }
+ else if (dim == 2) {
+ THTensor_(addmm)(gradInput,0,gradInput,1,gradOutput,weight);
+ }
+ }
+}
+
+void THNN_(Linear_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *addBuffer,
+ accreal scale_)
+{
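+  /* gradWeight += scale * gradOutput^T * input and, when a bias is present,
+     gradBias += scale * (sum of gradOutput over the batch), computed via the
+     ones vector in addBuffer. */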
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ long dim = THTensor_(nDimension)(input);
+ if (dim == 1) {
+ THTensor_(addr)(gradWeight,1,gradWeight,scale,gradOutput,input);
+ if (bias) {
+ THTensor_(cadd)(gradBias,gradBias,scale,gradOutput);
+ }
+ }
+ else if (dim == 2) {
+ THTensor *tgradOutput = THTensor_(new)();
+ THTensor_(transpose)(tgradOutput,gradOutput,0,1);
+ THTensor_(addmm)(gradWeight,1,gradWeight,scale,tgradOutput,input);
+ if (bias) {
+ THNN_(Linear_updateAddBuffer)(state,input,addBuffer);
+ THTensor_(addmv)(gradBias,1,gradBias,scale,tgradOutput,addBuffer);
+ }
+ THTensor_(free)(tgradOutput);
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/LogSigmoid.c b/contrib/lua-torch/nn/lib/THNN/generic/LogSigmoid.c
new file mode 100644
index 000000000..651d56002
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/LogSigmoid.c
@@ -0,0 +1,36 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/LogSigmoid.c"
+#else
+
+void THNN_(LogSigmoid_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *buffer)
+{
+ THTensor_(resizeAs)(output, input);
+ THTensor_(resizeAs)(buffer, input);
+
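+  /* logsigmoid(x) = -log(1 + exp(-x)); buffer caches z = exp(-x) for backward. */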
+ TH_TENSOR_APPLY3(real, output, real, input, real, buffer,
+ real z = exp(-*input_data);
+ *buffer_data = z;
+ *output_data = -log(1. + z);
+ );
+}
+
+void THNN_(LogSigmoid_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *buffer)
+{
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ THTensor_(resizeAs)(gradInput, buffer);
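+  /* d/dx logsigmoid(x) = z / (1 + z) with z = exp(-x) saved from the forward pass. */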
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, buffer,
+ real z = *buffer_data;
+ *gradInput_data = *gradOutput_data * z / (1. + z);
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/LogSoftMax.c b/contrib/lua-torch/nn/lib/THNN/generic/LogSoftMax.c
new file mode 100644
index 000000000..a7280422b
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/LogSoftMax.c
@@ -0,0 +1,137 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/LogSoftMax.c"
+#else
+
+void THNN_(LogSoftMax_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output)
+{
+ real *input_data, *output_data;
+ ptrdiff_t nframe = 0, dim = 0, stride = 0;
+ ptrdiff_t t, d;
+
+ if (input->nDimension == 1)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ stride = 1;
+ }
+ else if (input->nDimension == 2)
+ {
+ nframe = input->size[0];
+ dim = input->size[1];
+ stride = 1;
+ }
+ else if (input->nDimension == 3)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ stride = input->size[1]*input->size[2];
+ }
+ else if (input->nDimension == 4)
+ {
+ nframe = input->size[0];
+ dim = input->size[1];
+ stride = input->size[2]*input->size[3];
+ }
+ else
+ THArgCheck(0, 2, "1D, 2D, 3D or 4D tensor expected");
+
+ input = THTensor_(newContiguous)(input);
+ THTensor_(resizeAs)(output, input);
+
+ real *input_data0 = THTensor_(data)(input);
+ real *output_data0 = THTensor_(data)(output);
+
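+  /* Numerically stable log-softmax over each slice of length dim:
+     out[d] = in[d] - (max + log(sum_d exp(in[d] - max))).
+     Subtracting the maximum first keeps exp() from overflowing. */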
+ accreal logsum;
+ real maxInput;
+ #pragma omp parallel for private(t, d, maxInput, logsum, input_data, output_data)
+ for (t = 0; t < stride*nframe; t++)
+ {
+ logsum = 0;
+ maxInput = -THInf;
+ input_data = input_data0 + (t/stride)*dim*stride + t % stride;
+ output_data = output_data0 + (t/stride)*dim*stride + t % stride;
+
+ for (d = 0; d < dim; d++)
+ maxInput = THMax(maxInput, input_data[d*stride]);
+
+ for (d = 0; d < dim; d++)
+ logsum += exp(input_data[d*stride] - maxInput);
+ logsum = maxInput + log(logsum);
+
+ for (d = 0; d < dim; d++)
+ output_data[d*stride] = input_data[d*stride] - logsum;
+ }
+
+ THTensor_(free)(input);
+}
+
+void THNN_(LogSoftMax_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output)
+{
+ THNN_CHECK_SHAPE(input, gradOutput);
+ real *gradInput_data, *gradOutput_data, *output_data;
+ ptrdiff_t nframe = 0, dim = 0, stride = 0;
+ ptrdiff_t t, d;
+
+ if (output->nDimension == 1)
+ {
+ nframe = 1;
+ dim = output->size[0];
+ stride = 1;
+ }
+ else if (output->nDimension == 2)
+ {
+ nframe = output->size[0];
+ dim = output->size[1];
+ stride = 1;
+ }
+ else if (output->nDimension == 3)
+ {
+ nframe = 1;
+ dim = output->size[0];
+ stride = output->size[1]*output->size[2];
+ }
+ else if (output->nDimension == 4)
+ {
+ nframe = output->size[0];
+ dim = output->size[1];
+ stride = output->size[2]*output->size[3];
+ }
+ else
+ THError("1D, 2D, 3D or 4D tensor expected");
+
+ output = THTensor_(newContiguous)(output);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ THTensor_(resizeAs)(gradInput, output);
+ real *gradInput_data0 = THTensor_(data)(gradInput);
+ real *output_data0 = THTensor_(data)(output);
+ real *gradOutput_data0 = THTensor_(data)(gradOutput);
+ accreal sum;
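+  /* gradInput[d] = gradOutput[d] - exp(output[d]) * sum_d' gradOutput[d'],
+     where exp(output[d]) recovers the softmax probability. */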
+ #pragma omp parallel for private(t, sum, d, gradInput_data, output_data, gradOutput_data)
+ for (t = 0; t < stride*nframe; t++)
+ {
+ sum = 0;
+ gradInput_data = gradInput_data0 + (t/stride)*dim*stride + t % stride;
+ output_data = output_data0 + (t/stride)*dim*stride + t % stride;
+ gradOutput_data = gradOutput_data0 + (t/stride)*dim*stride + t % stride;
+
+ for (d = 0; d < dim; d++)
+ sum += gradOutput_data[d*stride];
+
+ for (d = 0; d < dim; d++)
+ gradInput_data[d*stride] = gradOutput_data[d*stride] - exp(output_data[d*stride])*sum;
+ }
+
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(output);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/LookupTable.c b/contrib/lua-torch/nn/lib/THNN/generic/LookupTable.c
new file mode 100644
index 000000000..46bc2c3c1
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/LookupTable.c
@@ -0,0 +1,225 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/LookupTable.c"
+#else
+
+static void THNN_(LookupTable_resetCount)(
+ THInteger_t *count_data,
+ THIndexTensor *input)
+{
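+  /* Two passes over the indices: first zero the counters this input touches,
+     then count how many times each index occurs. */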
+ ptrdiff_t i;
+ THIndex_t *input_data = THIndexTensor_(data)(input);
+ ptrdiff_t numel = THIndexTensor_(nElement)(input);
+
+ for (i = 0; i<numel; i++)
+ {
+ long k = input_data[i] - TH_INDEX_BASE;
+ count_data[k] = 0;
+ }
+ for (i = 0; i<numel; i++)
+ {
+ long k = input_data[i] - TH_INDEX_BASE;
+ count_data[k]++;
+ }
+}
+
+void THNN_(LookupTable_accGradParameters)(
+ THNNState *state,
+ THIndexTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THIntegerTensor *count,
+ THTensor *sorted,
+ THIndexTensor *indices,
+ bool scaleGradByFreq,
+ int paddingValue,
+ accreal ascale)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(ascale);
+ ptrdiff_t i;
+ THInteger_t *count_data = NULL;
+
+ if (scaleGradByFreq)
+ {
+ THIntegerTensor_(resize1d)(count, gradWeight->size[0]);
+ count_data = THIntegerTensor_(data)(count);
+ }
+
+ if (!THTensor_(isContiguous)(gradWeight))
+ THError("gradWeight must be contiguous");
+ if (!THIndexTensor_(isContiguous)(input))
+ THError("input must be contiguous");
+ if (THIndexTensor_(nDimension)(input) != 1 && THIndexTensor_(nDimension)(input) != 2) {
+ THDescBuff s1 = THIndexTensor_(sizeDesc)(input);
+ THError("input must be a vector or matrix, but is of shape: %s", s1.str);
+ }
+
+ THIndex_t *input_data = THIndexTensor_(data)(input);
+ ptrdiff_t numel = THIndexTensor_(nElement)(input);
+ long numw = THTensor_(size)(gradWeight, 0);
+
+ // check that inputs are all within range
+ for (i=0; i<numel; i++)
+ if (input_data[i] < TH_INDEX_BASE || input_data[i] >= numw + TH_INDEX_BASE) {
+ THError("inputs need to be in the range %ld <= input < %ld, "
+ "but got input of value: %ld", TH_INDEX_BASE, (numw + TH_INDEX_BASE),
+ input_data[i]);
+ }
+
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ real *gw = THTensor_(data)(gradWeight);
+ real *go = THTensor_(data)(gradOutput);
+ long stride = THTensor_(stride)(gradWeight, 0);
+
+ if (count_data)
+ THNN_(LookupTable_resetCount)(count_data, input);
+
+#ifdef _OPENMP
+ if (numel > 1000)
+ {
+ // The strategy is to parallelize over sections of the vocabulary, so that
+ // thread 1 handles updates to gradWeight[0..nVocab/nThreads]. Every thread
+ // has to traverse the entire input, but the dominating factor is the axpy
+ // BLAS call.
+ #pragma omp parallel private(i)
+ {
+ int tid = omp_get_thread_num();
+ int nthreads = omp_get_num_threads();
+
+ long start = tid * (numw/nthreads + 1);
+ long end = start + (numw/nthreads + 1);
+ for (i=0; i<numel; i++)
+ {
+ if (input_data[i] != paddingValue)
+ {
+ long k = input_data[i] - TH_INDEX_BASE;
+ if (k >= start && k < end)
+ {
+ real scale_ = scale;
+ if (count_data) scale_ /= count_data[k];
+ THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1);
+ }
+ }
+ }
+ }
+
+ THTensor_(free)(gradOutput);
+ return;
+ }
+#endif
+
+ for (i=0; i<numel; i++)
+ {
+ if (input_data[i] != paddingValue)
+ {
+ long k = input_data[i] - TH_INDEX_BASE;
+ real scale_ = scale;
+ if (count_data) scale_ /= count_data[k];
+ THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1);
+ }
+ }
+
+ THTensor_(free)(gradOutput);
+}
+
+/*
+ * Keep the norm of weight smaller than maxNorm
+ */
+
+static void THNN_(LookupTable_renormRow)(
+ real *row_data,
+ long stride,
+ real maxNorm,
+ real normType)
+{
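+  /* Computes the normType-norm of one embedding row and rescales the row
+     toward maxNorm when the norm exceeds it; the 1e-7 guards the division. */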
+ real norm = 0;
+ real new_norm;
+ long j;
+ for (j=0; j<stride; j++)
+ {
+ if (normType == 1) {
+ norm += fabs(row_data[j]);
+ } else if (normType == 2) {
+ norm += row_data[j] * row_data[j];
+ } else {
+ norm += pow(fabs(row_data[j]), normType);
+ }
+ }
+ norm = pow(norm, 1.0 / normType);
+ if (norm > maxNorm)
+ {
+ new_norm = maxNorm / (norm + 1e-7);
+ for (j=0; j<stride; j++) {
+ row_data[j] *= new_norm;
+ }
+ }
+}
+
+static int THNN_(compare_THIndex)(const void* a, const void* b)
+{
+  const THIndex_t va = *(const THIndex_t*)a;
+  const THIndex_t vb = *(const THIndex_t*)b;
+  /* return 0 on equal keys so the comparator gives qsort a consistent ordering */
+  return (va > vb) - (va < vb);
+}
+
+void THNN_(LookupTable_renorm)(
+ THNNState *state,
+ THIndexTensor *idx,
+ THTensor *weight,
+ accreal maxNorm_,
+ accreal normType_)
+{
+ real maxNorm = TH_CONVERT_ACCREAL_TO_REAL(maxNorm_);
+ real normType = TH_CONVERT_ACCREAL_TO_REAL(normType_);
+ if (!THTensor_(isContiguous)(weight))
+ THError("weight must be contiguous");
+ if (!THIndexTensor_(isContiguous)(idx))
+ THError("input must be contiguous");
+ if (THIndexTensor_(nDimension)(idx) != 1)
+ THError("idx must be a vector");
+ if (normType <= 0)
+ THError("non-positive-norm not supported");
+
+ ptrdiff_t i;
+ THIndex_t *row_idx = THIndexTensor_(data)(idx);
+ ptrdiff_t numel = THIndexTensor_(nElement)(idx);
+
+ long numw = THTensor_(size)(weight, 0);
+ long stride = THTensor_(stride)(weight, 0);
+ real *gw = THTensor_(data)(weight);
+ for (i=0; i<numel; i++) {
+ if (row_idx[i] < TH_INDEX_BASE || row_idx[i] >= numw + TH_INDEX_BASE) {
+ THError("input need to be in the range %ld <= input < %ld, "
+ "but got input of value: %ld", TH_INDEX_BASE, (numw + TH_INDEX_BASE),
+ row_idx[i]);
+ }
+ }
+ // get unique indices
+ qsort(row_idx, numel, sizeof(THIndex_t), THNN_(compare_THIndex));
+ ptrdiff_t ptr = 0;
+ for (i=0; i<numel; i++)
+ if (i == 0 || row_idx[i] != row_idx[i-1])
+ row_idx[ptr++] = row_idx[i];
+ numel = ptr;
+
+#ifdef _OPENMP
+ if (numel > 1000)
+ {
+ // The strategy is to parallelize over the rows that appear in
+ // row_idx, so that thread 1 handles the rows in row_idx[0..numel/nThreads].
+ // This distributes the work evenly to each thread.
+ #pragma omp parallel for private(i)
+ for (i=0; i<numel; i++)
+ {
+ long k = row_idx[i] - TH_INDEX_BASE;
+ THNN_(LookupTable_renormRow)(gw + k*stride, stride, maxNorm, normType);
+ }
+ return;
+ }
+#endif
+ for (i=0; i<numel; i++)
+ {
+ long k = row_idx[i] - TH_INDEX_BASE;
+ THNN_(LookupTable_renormRow)(gw + k*stride, stride, maxNorm, normType);
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/MSECriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/MSECriterion.c
new file mode 100644
index 000000000..58911f6f0
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/MSECriterion.c
@@ -0,0 +1,45 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/MSECriterion.c"
+#else
+
+void THNN_(MSECriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage)
+{
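+  /* output = sum_i (input_i - target_i)^2, divided by nElement when sizeAverage. */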
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+
+ real sum = 0;
+
+ TH_TENSOR_APPLY2(real, input, real, target,
+ real z = (*input_data - *target_data);
+ sum += z*z;
+ );
+
+ if (sizeAverage)
+ sum /= THTensor_(nElement)(input);
+
+ THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(MSECriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+
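+  /* d/dinput_i of the (optionally averaged) squared error: 2 * (input_i - target_i) [/ N]. */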
+ real norm = (sizeAverage ? 2./((real)THTensor_(nElement)(input)) : 2.);
+
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+ *gradInput_data = norm * (*input_data - *target_data);
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/MarginCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/MarginCriterion.c
new file mode 100644
index 000000000..d6d9b60b9
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/MarginCriterion.c
@@ -0,0 +1,47 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/MarginCriterion.c"
+#else
+
+void THNN_(MarginCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage,
+ accreal margin_)
+{
+ real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_);
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+ real sum = 0;
+
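+  /* Hinge loss: sum_i max(0, margin - input_i * target_i), with targets in {-1, 1}. */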
+ TH_TENSOR_APPLY2(real, input, real, target,
+ real z = (margin - *input_data * *target_data);
+ sum += z>0 ? z : 0;
+ );
+
+ if (sizeAverage)
+ sum /= THTensor_(nElement)(input);
+
+ THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(MarginCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage,
+ accreal margin_)
+{
+ real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_);
+ THNN_CHECK_NELEMENT(input, target);
+ real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
+
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+ *gradInput_data = (*input_data * *target_data) < margin ? -norm * *target_data : 0;
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/MultiLabelMarginCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/MultiLabelMarginCriterion.c
new file mode 100644
index 000000000..16398c13c
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/MultiLabelMarginCriterion.c
@@ -0,0 +1,184 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/MultiLabelMarginCriterion.c"
+#else
+
+// TODO: improve error messages
+void THNN_(MultiLabelMarginCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *output,
+ THTensor *isTarget,
+ bool sizeAverage)
+{
+ real *input_data, *isTarget_data;
+ THIndex_t *target_data;
+ long nframe, dim;
+ long t, d, dt, ddt;
+ real sum;
+
+ THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2,
+ "vector or matrix expected");
+
+ if (input->nDimension == 1)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3,
+ "inconsistent target size");
+ }
+ else
+ {
+ nframe = input->size[0];
+ dim = input->size[1];
+ THArgCheck((target->nDimension == 2) && (target->size[0] == nframe)
+ && (target->size[1] == dim), 3, "inconsistent target size");
+ }
+
+ THArgCheck(THIndexTensor_(minall)(target) >= -1+TH_INDEX_BASE, 3, "target out of range");
+ THArgCheck(THIndexTensor_(maxall)(target) < dim+TH_INDEX_BASE, 3, "target out of range");
+
+ target = THIndexTensor_(newContiguous)(target);
+ input = THTensor_(newContiguous)(input);
+ input_data = THTensor_(data)(input);
+ target_data = THIndexTensor_(data)(target);
+
+ THNN_resizeAs_indices(isTarget, target);
+ THTensor_(zero)(isTarget);
+ isTarget_data = THTensor_(data)(isTarget);
+
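+  /* Per frame: for every valid target class t (entries before the first
+     index < 0) and every non-target class d, accumulate
+     max(0, 1 - input[t] + input[d]). */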
+ sum = 0;
+ for (t = 0; t < nframe; t++)
+ {
+ for (ddt = 0; ddt < dim; ddt++)
+ {
+ THIndex_t target_idx = target_data[ddt] - TH_INDEX_BASE;
+ if (target_idx < 0)
+ break;
+ isTarget_data[target_idx] = 1;
+ }
+ for (dt = 0; dt < dim; dt++)
+ {
+ THIndex_t target_idx = target_data[dt] - TH_INDEX_BASE;
+ real input_target;
+ if (target_idx < 0)
+ break;
+
+ input_target = input_data[target_idx];
+ for (d = 0; d < dim; d++)
+ {
+ if (!isTarget_data[d])
+ {
+ real z = 1 - input_target + input_data[d];
+ if (z > 0)
+ sum += z;
+ }
+ }
+ }
+ input_data += dim;
+ target_data += dim;
+ isTarget_data += dim;
+ }
+
+ sum /= dim;
+ if (sizeAverage)
+ sum /= nframe;
+
+ THTensor_(set1d)(output, 0, sum);
+
+ THTensor_(free)(input);
+ THIndexTensor_(free)(target);
+}
+
+void THNN_(MultiLabelMarginCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *gradInput,
+ THTensor *isTarget,
+ bool sizeAverage)
+{
+ real *input_data;
+ real *gradInput_data;
+ THIndex_t *target_data;
+ real *isTarget_data;
+ long nframe, dim;
+ long t, d, dt;
+ real g;
+
+ THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2,
+ "vector or matrix expected");
+
+ if (input->nDimension == 1)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3,
+ "inconsistent target size");
+ THArgCheck((isTarget->nDimension == 1) && (isTarget->size[0] == dim), 3,
+ "inconsistent isTarget size");
+ }
+ else
+ {
+ nframe = input->size[0];
+ dim = input->size[1];
+ THArgCheck((target->nDimension == 2) && (target->size[0] == nframe)
+ && (target->size[1] == dim), 3, "inconsistent target size");
+ THArgCheck((isTarget->nDimension == 2) && (isTarget->size[0] == nframe)
+ && (isTarget->size[1] == dim), 3, "inconsistent isTarget size");
+ }
+
+ THArgCheck(THIndexTensor_(minall)(target) >= -1+TH_INDEX_BASE, 3, "target out of range");
+ THArgCheck(THIndexTensor_(maxall)(target) < dim+TH_INDEX_BASE, 3, "target out of range");
+
+ THArgCheck(THTensor_(minall)(isTarget) >= 0, 3, "isTarget out of range");
+ THArgCheck(THTensor_(maxall)(isTarget) <= 1, 3, "isTarget out of range");
+
+ target = THIndexTensor_(newContiguous)(target);
+ input = THTensor_(newContiguous)(input);
+ isTarget = THTensor_(newContiguous)(isTarget);
+ input_data = THTensor_(data)(input);
+ target_data = THIndexTensor_(data)(target);
+ isTarget_data = THTensor_(data)(isTarget);
+
+ g = sizeAverage ? ( 1./((real)(nframe*dim)) ) : ( 1./((real)dim) );
+
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+ gradInput_data = THTensor_(data)(gradInput);
+
+ for (t = 0; t < nframe; t++)
+ {
+ for (dt = 0; dt < dim; dt++)
+ {
+ THIndex_t target_idx = target_data[dt] - TH_INDEX_BASE;
+ real input_target;
+ if (target_idx < 0)
+ break;
+
+ input_target = input_data[target_idx];
+ for (d = 0; d < dim; d++)
+ {
+ if (!isTarget_data[d])
+ {
+ real z = 1 - input_target + input_data[d];
+ if (z > 0)
+ {
+ gradInput_data[target_idx] -= g;
+ gradInput_data[d] += g;
+ }
+ }
+ }
+ }
+ input_data += dim;
+ target_data += dim;
+ isTarget_data += dim;
+ gradInput_data += dim;
+ }
+
+ THTensor_(free)(input);
+ THIndexTensor_(free)(target);
+ THTensor_(free)(isTarget);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/MultiMarginCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/MultiMarginCriterion.c
new file mode 100644
index 000000000..2f8f8ff58
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/MultiMarginCriterion.c
@@ -0,0 +1,168 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/MultiMarginCriterion.c"
+#else
+
+// TODO: improve error messages
+void THNN_(MultiMarginCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *output,
+ bool sizeAverage,
+ int p,
+ THTensor *weights,
+ accreal margin_)
+{
+ real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_);
+ real *input_data, *weights_data;
+ THIndex_t *target_data;
+ long nframe, dim;
+ long t, d;
+ real sum;
+
+ THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2,
+ "vector or matrix expected");
+
+ if (input->nDimension == 1)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ }
+ else
+ {
+ nframe = input->size[0];
+ dim = input->size[1];
+ THArgCheck((target->nDimension == 1) && (target->size[0] == nframe), 3,
+ "inconsistent target size");
+ }
+
+ for (t = 0; t < nframe; t++)
+ {
+ THIndex_t idx = THIndexTensor_(get1d)(target, t);
+ THArgCheck((idx >= TH_INDEX_BASE) && (idx < dim + TH_INDEX_BASE), 3,
+ "target out of range");
+ }
+
+ input = THTensor_(newContiguous)(input);
+ target = THIndexTensor_(newContiguous)(target);
+ weights = weights ? THTensor_(newContiguous)(weights) : NULL;
+ input_data = THTensor_(data)(input);
+ target_data = THIndexTensor_(data)(target);
+ weights_data = weights ? THTensor_(data)(weights) : NULL;
+
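+  /* Multi-class hinge: for each frame with target y, accumulate
+     max(0, margin - input[y] + input[d])^p over all d != y, optionally
+     weighted by weights[y]. */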
+ sum = 0;
+ for (t = 0; t < nframe; t++)
+ {
+ THIndex_t target_idx = target_data[t] - TH_INDEX_BASE;
+ real input_target = input_data[target_idx];
+ for (d = 0; d < dim; d++)
+ {
+ real z = margin - input_target + input_data[d];
+ if (d == target_idx)
+ continue;
+
+ if (z > 0) {
+ real h = (p==1) ? z : z*z;
+ if(weights_data)
+ h *= weights_data[target_idx];
+ sum += h;
+ }
+ }
+ input_data += dim;
+ }
+
+ sum /= dim;
+ if(sizeAverage)
+ sum /= nframe;
+
+ THTensor_(set1d)(output, 0, sum);
+
+ THTensor_(free)(input);
+ THIndexTensor_(free)(target);
+ if(weights)
+ THTensor_(free)(weights);
+}
+
+void THNN_(MultiMarginCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage,
+ int p,
+ THTensor *weights,
+ accreal margin_)
+{
+ real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_);
+ real *input_data;
+ real *gradInput_data;
+ THIndex_t *target_data;
+ real *weights_data;
+ long nframe, dim;
+ long t, d;
+ real g;
+
+ THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2,
+ "vector or matrix expected");
+
+ if (input->nDimension == 1)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ }
+ else
+ {
+ nframe = input->size[0];
+ dim = input->size[1];
+ THArgCheck((target->nDimension == 1) && (target->size[0] == nframe), 3,
+ "inconsistent target size");
+ }
+
+ g = (sizeAverage ? 1./((real)(nframe*dim)) : 1./((real)dim));
+
+ input = THTensor_(newContiguous)(input);
+ target = THIndexTensor_(newContiguous)(target);
+ input_data = THTensor_(data)(input);
+
+ THTensor_(resizeAs)(gradInput, input);
+ gradInput_data = THTensor_(data)(gradInput);
+
+ target_data = THIndexTensor_(data)(target);
+ weights = weights ? THTensor_(newContiguous)(weights) : NULL;
+ weights_data = weights ? THTensor_(data)(weights) : NULL;
+
+ for (t = 0; t < nframe; t++)
+ {
+ THIndex_t target_idx = target_data[t] - TH_INDEX_BASE;
+ real input_target = input_data[target_idx];
+ real gradInput_target = 0;
+ for (d = 0; d < dim; d++)
+ {
+ real z = margin - input_target + input_data[d];
+ if (d == target_idx)
+ continue;
+
+ if (z > 0)
+ {
+ real h = (p == 1) ? g : 2*g*z;
+ if(weights_data)
+ h *= weights_data[target_idx];
+ gradInput_target -= h;
+ gradInput_data[d] = h;
+ }
+ else
+ gradInput_data[d] = 0;
+ }
+ gradInput_data[target_idx] = gradInput_target;
+
+ input_data += dim;
+ gradInput_data += dim;
+ }
+
+ THTensor_(free)(input);
+ THIndexTensor_(free)(target);
+ if(weights)
+ THTensor_(free)(weights);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/PReLU.c b/contrib/lua-torch/nn/lib/THNN/generic/PReLU.c
new file mode 100644
index 000000000..488322fde
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/PReLU.c
@@ -0,0 +1,207 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/PReLU.c"
+#else
+
+void THNN_(PReLU_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THIndex_t nOutputPlane)
+{
+ THTensor_(resizeAs)(output, input);
+
+ if (nOutputPlane == 0)
+ {
+ // handle shared parameter case
+ real w = *THTensor_(data)(weight);
+ TH_TENSOR_APPLY2(real, output, real, input,
+ *output_data = (*input_data > 0) ? *input_data : w*(*input_data);
+ );
+ }
+ else
+ {
+ input = THTensor_(newContiguous)(input);
+ long bs = 1, ks = 1;
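+    /* View input as bs x nOutputPlane x ks: bs is the batch size and ks the
+       number of elements per channel (product of the trailing dims). */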
+ {
+ long input_ndim = THTensor_(nDimension)(input);
+ if (input->size[input_ndim > 1] != nOutputPlane)
+ THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]);
+
+ if (input_ndim > 1) {
+ bs = input->size[0];
+ for (int d = 2; d < input_ndim; d++) {
+ ks *= input->size[d];
+ }
+ }
+ }
+
+ real *output_data = THTensor_(data)(output);
+ real *input_data = THTensor_(data)(input);
+ real *weight_data = THTensor_(data)(weight);
+ THIndex_t i, j, k;
+#pragma omp parallel for private(j,k)
+ for (i = 0; i < bs; ++i)
+ {
+ real* n_input_data = input_data + i*nOutputPlane*ks;
+ real* n_output_data = output_data + i*nOutputPlane*ks;
+ for (j = 0; j < nOutputPlane; ++j)
+ {
+ for (k = 0; k < ks; ++k)
+ n_output_data[k] = (n_input_data[k] > 0) ? n_input_data[k] : weight_data[j] * n_input_data[k];
+ n_input_data += ks;
+ n_output_data += ks;
+ }
+ }
+ THTensor_(free)(input);
+ }
+}
+
+void THNN_(PReLU_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THIndex_t nOutputPlane)
+{
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ THTensor_(resizeAs)(gradInput, input);
+
+ if (nOutputPlane == 0)
+ {
+ real w = THTensor_(data)(weight)[0];
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ if ((*input_data) > 0)
+ *gradInput_data = *gradOutput_data;
+ else
+ *gradInput_data = w * (*gradOutput_data);
+ );
+ }
+ else
+ {
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ weight = THTensor_(newContiguous)(weight);
+ const real *input_data = THTensor_(data)(input);
+ const real *gradOutput_data = THTensor_(data)(gradOutput);
+ const real *weight_data = THTensor_(data)(weight);
+ real *gradInput_data = THTensor_(data)(gradInput);
+
+ long bs = 1, ks = 1;
+ {
+ long input_ndim = THTensor_(nDimension)(input);
+ if (input->size[input_ndim > 1] != nOutputPlane)
+ THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]);
+
+ if (input_ndim > 1) {
+ bs = input->size[0];
+ for (int d = 2; d < input_ndim; d++) {
+ ks *= input->size[d];
+ }
+ }
+ }
+
+ THIndex_t i, j, k;
+#pragma omp parallel for private(j,k)
+ for (i = 0; i < bs; ++i)
+ {
+ const real *n_input_data = input_data + i*nOutputPlane*ks;
+ const real *n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks;
+ real *n_gradInput_data = gradInput_data + i*nOutputPlane*ks;
+
+ for (j = 0; j < nOutputPlane; ++j)
+ {
+ real w = weight_data[j];
+ for (k = 0; k < ks; ++k)
+ {
+ if (n_input_data[k] > 0)
+ n_gradInput_data[k] = n_gradOutput_data[k];
+ else
+ n_gradInput_data[k] = n_gradOutput_data[k] * w;
+ }
+ n_input_data += ks;
+ n_gradInput_data += ks;
+ n_gradOutput_data += ks;
+ }
+ }
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+ }
+}
+
+void THNN_(PReLU_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *gradWeight,
+ THTensor *gradWeightBuf,
+ THTensor *gradWeightBuf2,
+ THIndex_t nOutputPlane,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THNN_CHECK_NELEMENT(input, gradOutput);
+
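+  /* dL/dw_j = sum over positions where input <= 0 of input * gradOutput,
+     per channel j (or a single scalar when the weight is shared). */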
+ if (nOutputPlane == 0)
+ {
+ real *gradWeight_data = THTensor_(data)(gradWeight);
+ real sum = 0;
+ TH_TENSOR_APPLY2(real, input, real, gradOutput,
+ if ((*input_data) <= 0)
+ sum += (*input_data) * (*gradOutput_data);
+ );
+ gradWeight_data[0] += scale * sum;
+ }
+ else
+ {
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 6, "gradWeight needs to be contiguous");
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ weight = THTensor_(newContiguous)(weight);
+ long bs = 1, ks = 1;
+ {
+ long input_ndim = THTensor_(nDimension)(input);
+ if (input->size[input_ndim > 1] != nOutputPlane)
+ THError("Wrong number of input planes. Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]);
+
+ if (input_ndim > 1) {
+ bs = input->size[0];
+ for (int d = 2; d < input_ndim; d++) {
+ ks *= input->size[d];
+ }
+ }
+ }
+
+ const real *input_data = THTensor_(data)(input);
+ const real *gradOutput_data = THTensor_(data)(gradOutput);
+ const real *weight_data = THTensor_(data)(weight);
+ real *gradWeight_data = THTensor_(data)(gradWeight);
+
+ THIndex_t i, j, k;
+ for (i = 0; i < bs; ++i)
+ {
+ const real *n_input_data = input_data + i*nOutputPlane*ks;
+ const real *n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks;
+
+ for (j = 0; j < nOutputPlane; ++j)
+ {
+ real sum = 0;
+ for (k = 0; k < ks; ++k)
+ if (n_input_data[k] <= 0)
+ sum += n_gradOutput_data[k] * n_input_data[k];
+ gradWeight_data[j] += scale * sum;
+ n_input_data += ks;
+ n_gradOutput_data += ks;
+ }
+ }
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/RReLU.c b/contrib/lua-torch/nn/lib/THNN/generic/RReLU.c
new file mode 100644
index 000000000..8fd46d3c2
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/RReLU.c
@@ -0,0 +1,132 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/RReLU.c"
+#else
+
+void THNN_(RReLU_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *noise,
+ accreal lower_,
+ accreal upper_,
+ bool train,
+ bool inplace,
+ THGenerator *generator)
+{
+ real lower = TH_CONVERT_ACCREAL_TO_REAL(lower_);
+ real upper = TH_CONVERT_ACCREAL_TO_REAL(upper_);
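+  /* Training: sample a per-element negative slope uniformly from [lower, upper]
+     and record it in noise for backward. Evaluation: use the fixed expected
+     slope (lower + upper) / 2, i.e. plain LeakyReLU. */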
+ if (train)
+ {
+    // sample per-element negative slopes and remember them in noise for backward
+ THTensor_(resizeAs)(noise, input);
+ if (inplace)
+ {
+ TH_TENSOR_APPLY2(real, input, real, noise,
+ if (*input_data <= 0)
+ {
+ const real r = (real)THRandom_uniform(generator, lower, upper);
+ *input_data = (*input_data) * r;
+ *noise_data = r;
+ }
+ else
+ {
+ *noise_data = 1;
+ }
+ );
+ THTensor_(set)(output, input);
+ }
+ else
+ {
+ THTensor_(resizeAs)(output, input);
+ TH_TENSOR_APPLY3(real, input, real, output, real, noise,
+ if (*input_data <= 0)
+ {
+ const real r = (real)THRandom_uniform(generator, lower, upper);
+ *output_data = (*input_data) * r;
+ *noise_data = r;
+ }
+ else
+ {
+ *output_data = *input_data;
+ *noise_data = 1;
+ }
+ );
+ }
+ }
+ else
+ {
+ const real negSlope = (lower + upper) / 2;
+ if (inplace)
+ {
+ TH_TENSOR_APPLY(real, input,
+ if (*input_data <= 0)
+ {
+ *input_data = *input_data * negSlope;
+ }
+ );
+ THTensor_(set)(output, input);
+ }
+ else
+ {
+ THTensor_(resizeAs)(output, input);
+ TH_TENSOR_APPLY2(real, input, real, output,
+ const real r = (*input_data) <= 0 ? negSlope : 1;
+ *output_data = *input_data * r;
+ );
+ }
+ }
+}
+
+void THNN_(RReLU_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *noise,
+ accreal lower_,
+ accreal upper_,
+ bool train,
+ bool inplace)
+{
+ real lower = TH_CONVERT_ACCREAL_TO_REAL(lower_);
+ real upper = TH_CONVERT_ACCREAL_TO_REAL(upper_);
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ if (train && upper - lower > 1E-6) // e.g. if upper == lower, RReLU behaves like LeakyReLU
+ {
+ // multiply the gradient by the noise tensor
+ if (inplace)
+ {
+ THTensor_(cmul)(gradOutput, gradOutput, noise);
+ THTensor_(set)(gradInput, gradOutput);
+ }
+ else
+ {
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(cmul)(gradInput, gradOutput, noise);
+ }
+ }
+ else
+ {
+ // use constant factor for negative input values
+ const real negSlope = (lower + upper) / 2;
+ if (inplace)
+ {
+ TH_TENSOR_APPLY2(real, gradOutput, real, input,
+ if (*input_data <= 0)
+ {
+ *gradOutput_data = (*gradOutput_data) * negSlope;
+ }
+ );
+ THTensor_(set)(gradInput, gradOutput);
+ }
+ else
+ {
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ *gradInput_data = (*input_data) <= 0 ? (*gradOutput_data) * negSlope : (*gradOutput_data);
+ );
+ }
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Sigmoid.c b/contrib/lua-torch/nn/lib/THNN/generic/Sigmoid.c
new file mode 100644
index 000000000..17fb2cb4d
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/Sigmoid.c
@@ -0,0 +1,28 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Sigmoid.c"
+#else
+
+void THNN_(Sigmoid_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output)
+{
+ THTensor_(sigmoid)(output, input);
+}
+
+void THNN_(Sigmoid_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output)
+{
+ THNN_CHECK_NELEMENT(output, gradOutput);
+ THTensor_(resizeAs)(gradInput, output);
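+  /* dsigmoid/dx = y * (1 - y), computed from the saved output y. */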
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
+ real z = *output_data;
+ *gradInput_data = *gradOutput_data * (1. - z) * z;
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SmoothL1Criterion.c b/contrib/lua-torch/nn/lib/THNN/generic/SmoothL1Criterion.c
new file mode 100644
index 000000000..d1928d11c
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SmoothL1Criterion.c
@@ -0,0 +1,49 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SmoothL1Criterion.c"
+#else
+
+void THNN_(SmoothL1Criterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+
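+  /* Huber-style loss on z = input - target: 0.5*z^2 for |z| < 1, |z| - 0.5 otherwise. */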
+ real sum = 0;
+ TH_TENSOR_APPLY2(real, input, real, target,
+ real z = fabs(*input_data - *target_data);
+ sum += z < 1 ? 0.5*z*z : z - 0.5;
+ );
+
+ if (sizeAverage)
+ sum /= THTensor_(nElement)(input);
+
+ THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(SmoothL1Criterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
+
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+ real x = *input_data - *target_data;
+ if (x < -1.)
+ *gradInput_data = - norm;
+ else if (x > 1.)
+ *gradInput_data = norm;
+ else
+ *gradInput_data = norm * x;
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SoftMarginCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/SoftMarginCriterion.c
new file mode 100644
index 000000000..bac0a3b53
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SoftMarginCriterion.c
@@ -0,0 +1,44 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SoftMarginCriterion.c"
+#else
+
+void THNN_(SoftMarginCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ THNN_CHECK_DIM_SIZE(output, 1, 0, 1);
+
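+  /* loss = sum_i log(1 + exp(-input_i * target_i)), optionally averaged. */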
+  real sum = 0;
+
+  TH_TENSOR_APPLY2(real, input, real, target,
+    real z = log(1. + exp(-*input_data * *target_data));
+    sum += z;
+  );
+
+ if(sizeAverage)
+ sum /= THTensor_(nElement)(input);
+
+ THTensor_(set1d)(output, 0, sum);
+}
+
+void THNN_(SoftMarginCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage)
+{
+ THNN_CHECK_NELEMENT(input, target);
+ real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.);
+
+ THTensor_(resizeAs)(gradInput, input);
+  TH_TENSOR_APPLY3(real, gradInput, real, input, real, target,
+    real z = exp(-*target_data * *input_data);
+    *gradInput_data = -norm*(*target_data)*z/(1. + z);
+  );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SoftMax.c b/contrib/lua-torch/nn/lib/THNN/generic/SoftMax.c
new file mode 100644
index 000000000..7b60d64c2
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SoftMax.c
@@ -0,0 +1,150 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SoftMax.c"
+#else
+
+void THNN_(SoftMax_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output)
+{
+ real *input_data, *output_data;
+ ptrdiff_t nframe = 0, dim = 0, stride = 0;
+ ptrdiff_t t;
+
+ if (input->nDimension == 1)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ stride = 1;
+ }
+ else if (input->nDimension == 2)
+ {
+ nframe = input->size[0];
+ dim = input->size[1];
+ stride = 1;
+ }
+ else if (input->nDimension == 3)
+ {
+ nframe = 1;
+ dim = input->size[0];
+ stride = input->size[1]*input->size[2];
+ }
+ else if (input->nDimension == 4)
+ {
+ nframe = input->size[0];
+ dim = input->size[1];
+ stride = input->size[2]*input->size[3];
+ }
+ else
+ {
+ THArgCheck(0, 2, "1D, 2D, 3D or 4D tensor expected");
+ }
+
+ input = THTensor_(newContiguous)(input);
+ THTensor_(resizeAs)(output, input);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
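+  /* Stable softmax per slice: shift by the max before exp() so the largest
+     exponent is 0, then normalize by the sum. */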
+#pragma omp parallel for private(t)
+ for (t = 0; t < stride*nframe; t++)
+ {
+ real *input_ptr = input_data + (t/stride)*dim*stride + t % stride;
+ real *output_ptr = output_data + (t/stride)*dim*stride + t % stride;
+
+ real inputMax = -THInf;
+ accreal sum;
+
+ ptrdiff_t d;
+ for (d = 0; d < dim; d++)
+ {
+ if (input_ptr[d*stride] >= inputMax) inputMax = input_ptr[d*stride];
+ }
+
+ sum = 0;
+ for (d = 0; d < dim; d++)
+ {
+ real z = exp(input_ptr[d*stride] - inputMax);
+ output_ptr[d*stride] = z;
+ sum += z;
+ }
+
+ for (d = 0; d < dim; d++)
+ {
+ output_ptr[d*stride] *= 1/sum;
+ }
+ }
+
+ THTensor_(free)(input);
+}
+
+void THNN_(SoftMax_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output)
+{
+ THNN_CHECK_SHAPE(input, gradOutput);
+ real *gradInput_data, *gradOutput_data, *output_data;
+ ptrdiff_t nframe = 0, dim = 0, stride = 0;
+ ptrdiff_t t;
+
+ if (output->nDimension == 1)
+ {
+ nframe = 1;
+ dim = output->size[0];
+ stride = 1;
+ }
+ else if (output->nDimension == 2)
+ {
+ nframe = output->size[0];
+ dim = output->size[1];
+ stride = 1;
+ }
+ else if (output->nDimension == 3)
+ {
+ nframe = 1;
+ dim = output->size[0];
+ stride = output->size[1]*output->size[2];
+ }
+ else if (output->nDimension == 4)
+ {
+ nframe = output->size[0];
+ dim = output->size[1];
+ stride = output->size[2]*output->size[3];
+ }
+ else
+ {
+ THError("1D, 2D, 3D or 4D tensor expected");
+ }
+
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ output = THTensor_(newContiguous)(output);
+
+ THTensor_(resizeAs)(gradInput, output);
+ gradInput_data = THTensor_(data)(gradInput);
+ output_data = THTensor_(data)(output);
+ gradOutput_data = THTensor_(data)(gradOutput);
+
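+  /* Softmax Jacobian applied to gradOutput:
+     gradInput[d] = y[d] * (gradOutput[d] - sum_d' gradOutput[d'] * y[d']). */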
+#pragma omp parallel for private(t)
+ for (t = 0; t < stride*nframe; t++)
+ {
+ real *gradInput_ptr = gradInput_data + (t/stride)*dim*stride + t % stride;
+ real *output_ptr = output_data + (t/stride)*dim*stride + t % stride;
+ real *gradOutput_ptr = gradOutput_data + (t/stride)*dim*stride + t % stride;
+
+ ptrdiff_t d;
+ accreal sum = 0;
+ for (d = 0; d < dim; d++)
+ sum += (accreal)gradOutput_ptr[d*stride] * output_ptr[d*stride];
+
+ for (d = 0; d < dim; d++)
+ gradInput_ptr[d*stride] = output_ptr[d*stride] * (gradOutput_ptr[d*stride] - sum);
+ }
+
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(output);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SoftPlus.c b/contrib/lua-torch/nn/lib/THNN/generic/SoftPlus.c
new file mode 100644
index 000000000..6491e66d6
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SoftPlus.c
@@ -0,0 +1,47 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SoftPlus.c"
+#else
+
+void THNN_(SoftPlus_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal beta_,
+ accreal threshold_)
+{
+ real beta = TH_CONVERT_ACCREAL_TO_REAL(beta_);
+ real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_);
+ THTensor_(resizeAs)(output, input);
+
+ // f(x) = 1/beta * log(1 + exp(beta * x))
+  TH_TENSOR_APPLY2(real, output, real, input,
+ *output_data = (*input_data * beta) > threshold ? *input_data : THLog1p(exp(*input_data * beta)) / beta;
+ );
+}
+
+void THNN_(SoftPlus_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output,
+ accreal beta_,
+ accreal threshold_)
+{
+ real beta = TH_CONVERT_ACCREAL_TO_REAL(beta_);
+ real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_);
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ THTensor_(resizeAs)(gradInput, output);
+
+ // d/dx[log(1+exp(k*x))/k] = exp(kx) / (exp(kx) + 1)
+ // SINCE
+ // y = (1/k)*log(1+exp(k*x)) --> x = (1/k)*log(exp(k*y)-1)
+ // THEREFORE:
+ // d/dx(f(x)) = (exp(k*y) - 1) / exp(k*y)
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
+ real z = exp(*output_data * beta);
+ *gradInput_data = (*output_data * beta) > threshold ? *gradOutput_data : *gradOutput_data * (z - 1.)/z;
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SoftShrink.c b/contrib/lua-torch/nn/lib/THNN/generic/SoftShrink.c
new file mode 100644
index 000000000..e77950868
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SoftShrink.c
@@ -0,0 +1,42 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SoftShrink.c"
+#else
+
+void THNN_(SoftShrink_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal lambda_)
+{
+ real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_);
+ THTensor_(resizeAs)(output, input);
+
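+  /* Soft shrinkage: shift input toward zero by lambda, zeroing the band [-lambda, lambda]. */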
+ TH_TENSOR_APPLY2(real, output, real, input,
+ if ((*input_data) > lambda)
+ *output_data = *input_data - lambda;
+ else if ((*input_data) < -lambda)
+ *output_data = *input_data + lambda;
+ else
+ *output_data = 0;
+ );
+}
+
+void THNN_(SoftShrink_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ accreal lambda_)
+{
+ real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_);
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ if ((*input_data) > lambda || (*input_data) < -lambda)
+ *gradInput_data = (*gradOutput_data);
+ else
+ *gradInput_data = 0;
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SparseLinear.c b/contrib/lua-torch/nn/lib/THNN/generic/SparseLinear.c
new file mode 100644
index 000000000..1cf712212
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SparseLinear.c
@@ -0,0 +1,564 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SparseLinear.c"
+#else
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#define ROW_PTR2(t, r) (THTensor_(data)(t) + (r) * (t)->stride[0])
+#define COL_PTR2(t, c) (THTensor_(data)(t) + (c) * (t)->stride[1])
+
+static bool THNN_(checkLegacyInput)(THTensor* t)
+{
+ return t->nDimension == 3 && t->size[2] == 2;
+}
+
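+/* New-format sparse input is a COO-style nnz x 3 matrix: judging by the
+   get2d(...) - 1 reads below, each row holds (sample index, feature index,
+   value) with 1-based indices. */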
+static bool THNN_(checkInput)(THTensor* t)
+{
+ return t->nDimension == 2 && t->size[1] == 3;
+}
+
+static bool THNN_(checkSize2D)(THTensor* t, long size0, long size1)
+{
+ return t->nDimension == 2 && t->size[0] == size0 && t->size[1] == size1;
+}
+
+static bool THNN_(checkSize1D)(THTensor* t, long size0)
+{
+ return t->nDimension == 1 && t->size[0] == size0;
+}
+
+static void THNN_(set1d)(THTensor *t, long x0, real value) {
+ THStorage_(set)(t->storage, t->storageOffset + x0*t->stride[0], value);
+}
+static real THNN_(get3d)(const THTensor *t, long x0, long x1, long x2) {
+ return THStorage_(get)(t->storage, t->storageOffset +
+ x0*t->stride[0] + x1*t->stride[1] + x2*t->stride[2]);
+}
+static real THNN_(get2d)(const THTensor *t, long x0, long x1) {
+ return THStorage_(get)(t->storage, t->storageOffset +
+ x0*t->stride[0] + x1*t->stride[1]);
+}
+
+void THNN_(SparseLinear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias)
+{
+ long h, i, j, hp0, hp1;
+ long outDim = THTensor_(size)(weight, 0);
+ long inDim = THTensor_(size)(weight, 1);
+ long batchSize = THTensor_(size)(output, 0);
+
+ THArgCheck(THNN_(checkInput)(input), 2, "input must be in coo format, nnz x 3");
+ THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
+ THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");
+
+ long nnz = THTensor_(size)(input, 0);
+
+ THLongTensor * csr = THLongTensor_newWithSize1d(batchSize+1);
+ THLongTensor_zero(csr);
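+  /* Build a CSR-style row-pointer array: csr[h] marks the first input row of
+     sample h. This assumes the nnz rows are grouped and ordered by sample index. */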
+
+ weight = THTensor_(newContiguous)(weight);
+
+//#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000)
+ for (i=0; i<nnz; i++) {
+ hp0 = (long)(THNN_(get2d)(input, i, 0)) - 1;
+ hp1 = (i+1 == nnz) ?
+ batchSize :
+ (long)(THNN_(get2d)(input, i+1, 0)) - 1;
+ if (hp0 != hp1) for (h = hp0; h < hp1; h++) {
+ THLongTensor_set1d(csr, h+1, i+1);
+ }
+ }
+
+
+ // output = weight * input + bias
+ THTensor_(zero)(output);
+#pragma omp parallel for private(h, i) schedule(static) if (nnz > 10000)
+ for (h = 0; h < batchSize; h++) {
+ long i_start = THLongTensor_get1d(csr, h);
+ long i_end = THLongTensor_get1d(csr, h+1);
+ for (i = i_start; i < i_end; i++) {
+ real val = THNN_(get2d)(input, i, 2);
+ if (val == 0) {
+ continue;
+ }
+
+ long offset = (long)(THNN_(get2d)(input, i, 1)) - 1;
+ if (offset >= 0 && offset < inDim) {
+ THBlas_(axpy)(outDim,
+ val,
+ COL_PTR2(weight, offset), weight->stride[0],
+ ROW_PTR2(output, h), output->stride[1]);
+ } else {
+ THError("index out of bound. updateOutput: %d not between 1 and %d",
+ offset + 1, inDim);
+ }
+ }
+ }
+
+ THTensor* output_row = THTensor_(new)();
+ for (h = 0; h < batchSize; h++) {
+ THTensor_(select)(output_row, output, 0, h);
+ THTensor_(cadd)(output_row, bias, 1.0, output_row);
+ }
+ THTensor_(free)(output_row);
+ THLongTensor_free(csr);
+ THTensor_(free)(weight);
+}
+
+void THNN_(SparseLinear_legacyUpdateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias)
+{
+ long h, i;
+ long outDim = THTensor_(size)(weight, 0);
+ long inDim = THTensor_(size)(weight, 1);
+
+ THArgCheck(THNN_(checkLegacyInput)(input), 2, "input size must be batchsize x nnz x 2");
+ THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
+ THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");
+
+ weight = THTensor_(newContiguous)(weight);
+
+ long batchSize = THTensor_(size)(input, 0);
+ long nnz = THTensor_(size)(input, 1);
+ THTensor_(resize2d)(output, batchSize, outDim);
+
+ // output = weight * input + bias
+ THTensor_(zero)(output);
+#pragma omp parallel for private(h, i) schedule(static) if ( \
+ batchSize > 1 && batchSize * nnz * outDim > 10000)
+ for (h = 0; h < batchSize; h++) {
+ for (i = 0; i < nnz; i++) {
+ real val = THNN_(get3d)(input, h, i, 1);
+ if (val == 0) {
+ continue;
+ }
+
+ long offset = (long)(THNN_(get3d)(input, h, i, 0)) - 1;
+ if (offset >= 0 && offset < inDim) {
+ THBlas_(axpy)(outDim,
+ val,
+ COL_PTR2(weight, offset), weight->stride[0],
+ ROW_PTR2(output, h), output->stride[1]);
+ } else {
+ THError("index out of bound. updateOutput: %d not between 1 and %d",
+ offset + 1, inDim);
+ }
+ }
+ }
+
+ THTensor* output_row = THTensor_(new)();
+ for (h = 0; h < batchSize; h++) {
+ THTensor_(select)(output_row, output, 0, h);
+ THTensor_(cadd)(output_row, bias, 1.0, output_row);
+ }
+ THTensor_(free)(output_row);
+ THTensor_(free)(weight);
+}
+
+void THNN_(SparseLinear_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *weight,
+ THTensor *bias,
+ accreal weightDecay_,
+ accreal scale_)
+{
+ real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ long h, i, col, hp0, hp1;
+ long outDim = THTensor_(size)(weight, 0);
+ long inDim = THTensor_(size)(weight, 1);
+
+ THArgCheck(THNN_(checkInput)(input), 2,
+ "input must be in coo format, nnz x 3");
+ THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
+ "gradWeight size wrong");
+ THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5,
+ "gradBias size wrong");
+ THArgCheck(THTensor_(isContiguous)(gradOutput), 1,
+ "gradOutput must be contiguous");
+
+ long nnz = THTensor_(size)(input, 0);
+
+ THLongTensor* csc = THLongTensor_newWithSize1d(inDim+1);
+ THLongTensor_zero(csc);
+ weight = THTensor_(newContiguous)(weight);
+
+#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000)
+ for (i = 0; i < nnz; i++) {
+ hp0 = (long)(THNN_(get2d)(input, i, 1)) - 1;
+ hp1 = (i+1 == nnz) ?
+ inDim :
+ (long)(THNN_(get2d)(input, i+1, 1)) - 1;
+ if (hp0 != hp1) for (h = hp0; h < hp1; h++) {
+ THLongTensor_set1d(csc, h+1, i+1);
+ }
+ }
+
+ // gradWeight += gradOutput * input
+#pragma omp parallel for private(h, i, col) schedule(static) if (nnz > 10000)
+ for (col = 0; col < inDim; col++) {
+ long i_start = THLongTensor_get1d(csc, col);
+ long i_end = THLongTensor_get1d(csc, col+1);
+ for (i = i_start; i < i_end; i++) {
+ real val = scale * THNN_(get2d)(input, i, 2);
+
+ h = (long)(THNN_(get2d)(input, i, 0)) - 1;
+ long offset = (long)(THNN_(get2d)(input, i, 1)) - 1;
+ if (offset >= 0 && offset < inDim) {
+ THBlas_(axpy)(outDim,
+ val,
+ ROW_PTR2(gradOutput, h), gradOutput->stride[1],
+ COL_PTR2(gradWeight, offset), gradWeight->stride[0]);
+ } else {
+        THError(
+            "index out of bounds. accGradParameters: %ld not between 1 and %ld",
+            offset + 1,
+            inDim);
+ }
+ }
+ }
+
+ // gradBias += gradOutput
+ THTensor* buf = THTensor_(new)();
+ THTensor_(sum)(buf, gradOutput, 0, 1);
+ THTensor_(cadd)(gradBias, gradBias, scale, buf);
+ THTensor_(free)(buf);
+ THLongTensor_free(csc);
+
+ if (weightDecay != 0) {
+ THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
+ }
+ THTensor_(free)(weight);
+}
+
+void THNN_(SparseLinear_legacyAccGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *weight,
+ THTensor *bias,
+ accreal weightDecay_,
+ accreal scale_)
+{
+ real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ long h, i;
+ long outDim = THTensor_(size)(weight, 0);
+ long inDim = THTensor_(size)(weight, 1);
+
+ THArgCheck(THNN_(checkLegacyInput)(input), 2,
+ "input size must be batchsize x nnz x 2");
+ THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
+ "gradWeight size wrong");
+ THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5,
+ "gradBias size wrong");
+ THArgCheck(THTensor_(isContiguous)(gradOutput), 1,
+ "gradOutput must be contiguous");
+
+ long batchSize = THTensor_(size)(input, 0);
+ long nnz = THTensor_(size)(input, 1);
+ THTensor_(resize2d)(gradOutput, batchSize, outDim);
+
+ // gradWeight += gradOutput * input
+#pragma omp parallel for private(h, i) schedule(static) if (\
+ batchSize * nnz * outDim > 10000)
+ for (i = 0; i < nnz; i++) {
+ for (h = 0; h < batchSize; h++) {
+ real val = scale * THNN_(get3d)(input, h, i, 1);
+ if (val == 0) {
+ continue;
+ }
+
+ long offset = (long)(THNN_(get3d)(input, h, i, 0)) - 1;
+ if (offset >= 0 && offset < inDim) {
+ THBlas_(axpy)(outDim,
+ val,
+ ROW_PTR2(gradOutput, h), gradOutput->stride[1],
+ COL_PTR2(gradWeight, offset), gradWeight->stride[0]);
+ } else {
+        THError(
+            "index out of bounds. accGradParameters: %ld not between 1 and %ld",
+            offset + 1,
+            inDim);
+ }
+ }
+ }
+
+ // gradBias += gradOutput
+ THTensor* gradOutput_row = THTensor_(new)();
+ for (h = 0; h < batchSize; h++) {
+ THTensor_(select)(gradOutput_row, gradOutput, 0, h);
+ THTensor_(cadd)(gradBias, gradBias, scale, gradOutput_row);
+ }
+ THTensor_(free)(gradOutput_row);
+
+ if (weightDecay != 0) {
+ THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
+ }
+}
+
+void THNN_(SparseLinear_updateParameters)(
+ THNNState *state,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *lastInput,
+ accreal learningRate_)
+{
+ real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
+ long h, i;
+ long outDim = weight->size[0];
+ long inDim = weight->size[1];
+
+ THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
+ "gradWeight size wrong");
+ THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong");
+ THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong");
+ THArgCheck(THNN_(checkInput)(lastInput), 6,
+ "input must be in coo format, nnz x 3");
+
+
+ long nnz = THTensor_(size)(lastInput, 0);
+
+ // collect unique offsets of non-0 val in input
+ THTensor* offsets = THTensor_(newWithSize1d)(nnz);
+ long cnt = 0;
+ for (i = 0; i < nnz; i++) {
+ real val = THNN_(get2d)(lastInput, i, 2);
+ if (val == 0) {
+ continue;
+ }
+ long offset = (long)(THNN_(get2d)(lastInput, i, 1)) - 1;
+ if (offset >= 0 && offset < inDim) {
+ THNN_(set1d)(offsets, cnt++, offset);
+ } else {
+      THError(
+          "index out of bounds. updateParameters: %ld not between 1 and %ld",
+          offset + 1,
+          inDim);
+ }
+ }
+  if (cnt == 0) {
+    THTensor_(free)(offsets);
+    return;
+  }
+ THTensor_(resize1d)(offsets, cnt);
+
+ THTensor* uniqueOffsets = THTensor_(new)();
+ THLongTensor* ri = THLongTensor_new();
+ THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0);
+ THLongTensor_free(ri);
+ THTensor_(free)(offsets);
+
+ cnt = 1;
+ real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets);
+ for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) {
+ if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) {
+ uniqueOffsets_p[cnt++] = uniqueOffsets_p[i];
+ }
+ }
+ THTensor_(resize1d)(uniqueOffsets, cnt);
+
+ // weight += -learningRate * gradWeight
+ THTensor_(cadd)(bias, bias, -learningRate, gradBias);
+#pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000)
+ for (i = 0; i < cnt; i++) {
+ long offset = (long)uniqueOffsets_p[i];
+ THBlas_(axpy)(outDim,
+ -learningRate,
+ COL_PTR2(gradWeight, offset), gradWeight->stride[0],
+ COL_PTR2(weight, offset), weight->stride[0]);
+ }
+
+ THTensor_(free)(uniqueOffsets);
+}
+
+void THNN_(SparseLinear_legacyUpdateParameters)(
+ THNNState *state,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *lastInput,
+ accreal learningRate_)
+{
+ real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
+ long h, i;
+ long outDim = weight->size[0];
+ long inDim = weight->size[1];
+
+ THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
+ "gradWeight size wrong");
+ THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong");
+ THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong");
+ THArgCheck(THNN_(checkLegacyInput)(lastInput), 6,
+ "input size must be batchsize x nnz x 2");
+
+
+ long batchSize = THTensor_(size)(lastInput, 0);
+ long nnz = THTensor_(size)(lastInput, 1);
+
+ // collect unique offsets of non-0 val in input
+ THTensor* offsets = THTensor_(newWithSize1d)(batchSize * nnz);
+ long cnt = 0;
+ for (h = 0; h < batchSize; h++) {
+ for (i = 0; i < nnz; i++) {
+ real val = THNN_(get3d)(lastInput, h, i, 1);
+ if (val == 0 ) {
+ continue;
+ }
+ long offset = (long)(THNN_(get3d)(lastInput, h, i, 0)) - 1;
+ if (offset >= 0 && offset < inDim) {
+ THNN_(set1d)(offsets, cnt++, offset);
+ } else {
+        THError(
+            "index out of bounds. updateParameters: %ld not between 1 and %ld",
+            offset + 1,
+            inDim);
+ }
+ }
+ }
+ THTensor_(resize1d)(offsets, cnt);
+
+ THTensor* uniqueOffsets = THTensor_(new)();
+ THLongTensor* ri = THLongTensor_new();
+ THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0);
+ THLongTensor_free(ri);
+ THTensor_(free)(offsets);
+
+ cnt = 1;
+ real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets);
+ for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) {
+ if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) {
+ uniqueOffsets_p[cnt++] = uniqueOffsets_p[i];
+ }
+ }
+ THTensor_(resize1d)(uniqueOffsets, cnt);
+
+ // weight += -learningRate * gradWeight
+ THTensor_(cadd)(bias, bias, -learningRate, gradBias);
+#pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000)
+ for (i = 0; i < cnt; i++) {
+ long offset = (long)uniqueOffsets_p[i];
+ THBlas_(axpy)(outDim,
+ -learningRate,
+ COL_PTR2(gradWeight, offset), gradWeight->stride[0],
+ COL_PTR2(weight, offset), weight->stride[0]);
+ }
+
+ THTensor_(free)(uniqueOffsets);
+}
+
+void THNN_(SparseLinear_zeroGradParameters)(
+ THNNState *state,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *lastInput)
+{
+ long h, i, j;
+
+ long outDim = gradWeight->size[0];
+ long inDim = gradWeight->size[1];
+
+ THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong");
+ THArgCheck(THNN_(checkInput)(lastInput), 4,
+ "input must be in coo format, nnz x 3");
+
+ THTensor_(zero)(gradBias);
+
+ long nnz = THTensor_(size)(lastInput, 0);
+
+#pragma omp parallel for private(i, j) schedule(static) if ( \
+ nnz * outDim > 10000)
+ for (i = 0; i < nnz; i++) {
+ if (THNN_(get2d)(lastInput, i, 2) == 0 ) {
+ continue;
+ }
+
+ long offset = (long)(THNN_(get2d)(lastInput, i, 1)) - 1;
+ if (offset >= 0 && offset < inDim) {
+ real* pGradWeight = COL_PTR2(gradWeight, offset);
+ if (gradWeight->stride[0] == 1) {
+ THVector_(fill)(pGradWeight, 0, outDim);
+ } else {
+ long stride = gradWeight->stride[0];
+ for (j = 0; j < outDim; ++j) {
+ pGradWeight[j * stride] = 0;
+ }
+ }
+ } else {
+      THError(
+          "index out of bounds. zeroGradParameters: %ld not between 1 and %ld",
+          offset + 1,
+          inDim);
+ }
+ }
+}
+
+void THNN_(SparseLinear_legacyZeroGradParameters)(
+ THNNState *state,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *lastInput)
+{
+ long h, i, j;
+
+ long outDim = gradWeight->size[0];
+ long inDim = gradWeight->size[1];
+
+ THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong");
+ THArgCheck(THNN_(checkLegacyInput)(lastInput), 4,
+ "input size must be batchsize x nnz x 2");
+
+ THTensor_(zero)(gradBias);
+
+ long batchSize = THTensor_(size)(lastInput, 0);
+ long nnz = THTensor_(size)(lastInput, 1);
+
+#pragma omp parallel for private(h, i, j) schedule(static) if ( \
+ batchSize > 1 && batchSize * nnz * outDim > 10000)
+ for (h = 0; h < batchSize; h++) {
+ for (i = 0; i < nnz; i++) {
+      if (THNN_(get3d)(lastInput, h, i, 1) == 0) {
+ continue;
+ }
+
+ long offset = (long)(THNN_(get3d)(lastInput, h, i, 0)) - 1;
+ if (offset >= 0 && offset < inDim) {
+ real* pGradWeight = COL_PTR2(gradWeight, offset);
+ if (gradWeight->stride[0] == 1) {
+ THVector_(fill)(pGradWeight, 0, outDim);
+ } else {
+ long stride = gradWeight->stride[0];
+ for (j = 0; j < outDim; ++j) {
+ pGradWeight[j * stride] = 0;
+ }
+ }
+ } else {
+        THError(
+            "index out of bounds. zeroGradParameters: %ld not between 1 and %ld",
+            offset + 1,
+            inDim);
+ }
+ }
+ }
+}
+
+#undef ROW_PTR2
+#undef COL_PTR2
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveAveragePooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveAveragePooling.c
new file mode 100644
index 000000000..3675b42d7
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveAveragePooling.c
@@ -0,0 +1,258 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialAdaptiveAveragePooling.c"
+#else
+
+#define START_IND(a,b,c) (int)floor((float)(a * c) / b)
+#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b)
+// #define START_IND(a,b,c) a * c / b
+// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0
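+// START_IND/END_IND map output index a (of b outputs) onto the half-open
+// input range [floor(a*c/b), ceil((a+1)*c/b)) over c inputs, so window sizes
+// adapt and neighbouring windows may overlap when c is not a multiple of b.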
+
+static void THNN_(SpatialAdaptiveAveragePooling_updateOutput_frame)(
+ real *input_p,
+ real *output_p,
+ long nslices,
+ long iwidth,
+ long iheight,
+ long owidth,
+ long oheight,
+ long stridew,
+ long strideh,
+ long strided)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ /* loop over output */
+ long i, j;
+ for(i = 0; i < oheight; i++)
+ {
+ int y_start = START_IND(i, oheight, iheight);
+ int y_end = END_IND(i, oheight, iheight);
+ int kH = y_end-y_start;
+
+ for(j = 0; j < owidth; j++)
+ {
+
+ int x_start = START_IND(j, owidth, iwidth);
+ int x_end = END_IND(j, owidth, iwidth);
+ int kW = x_end-x_start;
+
+ /* local pointers */
+ real *ip = input_p + k*strided + y_start*strideh + x_start*stridew;
+ real *op = output_p + k*owidth*oheight + i*owidth + j;
+
+ /* compute local average: */
+ real sum = 0;
+ int x,y;
+ for(y = 0; y < kH; y++)
+ {
+ for(x = 0; x < kW; x++)
+ {
+ real val = *(ip + y*strideh + x*stridew);
+ sum += val;
+ }
+ }
+
+ /* set output to local average */
+ *op = sum / kW / kH;
+ }
+ }
+ }
+}
+
+void THNN_(SpatialAdaptiveAveragePooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int owidth,
+ int oheight)
+{
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+ long nslices;
+ long iheight;
+ long iwidth;
+
+ long istride_d;
+ long istride_h;
+ long istride_w;
+ long istride_b;
+
+ real *input_data;
+ real *output_data;
+
+
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D (batch mode) tensor expected for input, but got: %s");
+
+ if (input->nDimension == 4)
+ {
+ istride_b = input->stride[0];
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimh-1];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ /* strides */
+ istride_d = input->stride[dimh-1];
+ istride_h = input->stride[dimh];
+ istride_w = input->stride[dimw];
+
+ /* resize output */
+ if (input->nDimension == 3)
+ {
+ THTensor_(resize3d)(output, nslices, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+ THNN_(SpatialAdaptiveAveragePooling_updateOutput_frame)(input_data, output_data,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ istride_w,istride_h,
+ istride_d);
+ }
+ else
+ {
+ long p;
+
+ THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialAdaptiveAveragePooling_updateOutput_frame)(input_data+p*istride_b, output_data+p*nslices*owidth*oheight,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ istride_w,istride_h,
+ istride_d);
+ }
+ }
+}
+
+static void THNN_(SpatialAdaptiveAveragePooling_updateGradInput_frame)(
+ real *gradInput_p,
+ real *gradOutput_p,
+ long nslices,
+ long iwidth,
+ long iheight,
+ long owidth,
+ long oheight)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ real *gradInput_p_k = gradInput_p + k*iwidth*iheight;
+ real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
+
+ /* calculate average */
+ long i, j;
+ for(i = 0; i < oheight; i++)
+ {
+ int y_start = START_IND(i, oheight, iheight);
+ int y_end = END_IND(i, oheight, iheight);
+ int kH = y_end-y_start;
+
+ for(j = 0; j < owidth; j++)
+ {
+
+ int x_start = START_IND(j, owidth, iwidth);
+ int x_end = END_IND(j, owidth, iwidth);
+ int kW = x_end-x_start;
+
+ int x,y;
+ for(y = y_start; y < y_end; y++)
+ {
+ for(x = x_start; x < x_end; x++)
+ {
+ /* update gradient */
+ gradInput_p_k[y*iwidth + x] += gradOutput_p_k[i*owidth + j] / kW / kH;
+ }
+ }
+ }
+ }
+ }
+}
+
+void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput)
+{
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+ int nslices;
+ int iheight;
+ int iwidth;
+ int oheight;
+ int owidth;
+ real *gradInput_data;
+ real *gradOutput_data;
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ if (input->nDimension == 4) {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimh-1];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ oheight = gradOutput->size[dimh];
+ owidth = gradOutput->size[dimw];
+
+ /* get raw pointers */
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+
+ /* backprop */
+ if (input->nDimension == 3)
+ {
+ THNN_(SpatialAdaptiveAveragePooling_updateGradInput_frame)(gradInput_data, gradOutput_data,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight);
+ }
+ else
+ {
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialAdaptiveAveragePooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
+
+#undef START_IND
+#undef END_IND \ No newline at end of file
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveMaxPooling.c
new file mode 100644
index 000000000..fff716e67
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveMaxPooling.c
@@ -0,0 +1,274 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialAdaptiveMaxPooling.c"
+#else
+
+static void THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(
+ real *input_p,
+ real *output_p,
+ THIndex_t *indx_p,
+ THIndex_t *indy_p,
+ long nslices,
+ long iwidth,
+ long iheight,
+ long owidth,
+ long oheight,
+ long stridew,
+ long strideh,
+ long strided)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ /* loop over output */
+ long i, j;
+ for(i = 0; i < oheight; i++)
+ {
+ int y_start = (int)floor((float)i / oheight * iheight);
+ int y_end = (int)ceil((float)(i + 1) / oheight * iheight);
+ int kH = y_end-y_start;
+
+ for(j = 0; j < owidth; j++)
+ {
+
+ int x_start = (int)floor((float)j / owidth * iwidth);
+ int x_end = (int)ceil((float)(j + 1) / owidth * iwidth);
+ int kW = x_end-x_start;
+
+ /* local pointers */
+ real *ip = input_p + k*strided + y_start*strideh + x_start*stridew;
+ real *op = output_p + k*owidth*oheight + i*owidth + j;
+ THIndex_t *indyp = indy_p + k*owidth*oheight + i*owidth + j;
+ THIndex_t *indxp = indx_p + k*owidth*oheight + i*owidth + j;
+
+ /* compute local max: */
+ long maxindex = -1;
+ real maxval = -FLT_MAX;
+ long tcntr = 0;
+ int x,y;
+ for(y = 0; y < kH; y++)
+ {
+ for(x = 0; x < kW; x++)
+ {
+ real val = *(ip + y*strideh + x*stridew);
+ if (val > maxval)
+ {
+ maxval = val;
+ maxindex = tcntr;
+ }
+ tcntr++;
+ }
+ }
+
+ /* set output to local max */
+ *op = maxval;
+
+ /* store location of max (x,y) */
+ *indyp = (maxindex / kW) + TH_INDEX_BASE;
+ *indxp = (maxindex % kW) + TH_INDEX_BASE;
+ }
+ }
+ }
+}
+
+void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int owidth,
+ int oheight)
+{
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+ long nslices;
+ long iheight;
+ long iwidth;
+
+ long istride_d;
+ long istride_h;
+ long istride_w;
+ long istride_b;
+
+ real *input_data;
+ real *output_data;
+ THIndex_t *indices_data;
+
+
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D (batch mode) tensor expected for input, but got: %s");
+
+ if (input->nDimension == 4)
+ {
+ istride_b = input->stride[0];
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimh-1];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ /* strides */
+ istride_d = input->stride[dimh-1];
+ istride_h = input->stride[dimh];
+ istride_w = input->stride[dimw];
+
+ /* resize output */
+ if (input->nDimension == 3)
+ {
+ THTensor_(resize3d)(output, nslices, oheight, owidth);
+ /* indices will contain i,j locations for each output point */
+ THIndexTensor_(resize4d)(indices, 2, nslices, oheight, owidth);
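+    /* plane 0 stores the row (y) index of each max, plane 1 the column (x)
+       index; see the frame call below, which passes the planes separately */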
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data, output_data,
+ indices_data+nslices*owidth*oheight, indices_data,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ istride_w,istride_h,
+ istride_d);
+ }
+ else
+ {
+ long p;
+
+ THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
+ /* indices will contain i,j locations for each output point */
+ THIndexTensor_(resize5d)(indices, 2, nbatch, nslices, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data+p*istride_b, output_data+p*nslices*owidth*oheight,
+ indices_data+(p+nbatch)*nslices*owidth*oheight, indices_data+p*nslices*owidth*oheight,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ istride_w,istride_h,
+ istride_d);
+ }
+ }
+}
+
+static void THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(
+ real *gradInput_p,
+ real *gradOutput_p,
+ THIndex_t *indx_p,
+ THIndex_t *indy_p,
+ long nslices,
+ long iwidth,
+ long iheight,
+ long owidth,
+ long oheight)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ real *gradInput_p_k = gradInput_p + k*iwidth*iheight;
+ real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
+ THIndex_t *indx_p_k = indx_p + k*owidth*oheight;
+ THIndex_t *indy_p_k = indy_p + k*owidth*oheight;
+
+ /* calculate max points */
+ long i, j;
+ for(i = 0; i < oheight; i++)
+ {
+ int y_start = (int)floor((float) i / oheight * iheight);
+ for(j = 0; j < owidth; j++)
+ {
+ int x_start = (int)floor((float) j / owidth * iwidth);
+ /* retrieve position of max */
+ long maxi = indy_p_k[i*owidth + j] - TH_INDEX_BASE + y_start;
+ long maxj = indx_p_k[i*owidth + j] - TH_INDEX_BASE + x_start;
+
+ /* update gradient */
+ gradInput_p_k[maxi*iwidth + maxj] += gradOutput_p_k[i*owidth + j];
+ }
+ }
+ }
+}
+
+void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices)
+{
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+ int nslices;
+ int iheight;
+ int iwidth;
+ int oheight;
+ int owidth;
+ real *gradInput_data;
+ real *gradOutput_data;
+ THIndex_t *indices_data;
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ if (input->nDimension == 4) {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimh-1];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ oheight = gradOutput->size[dimh];
+ owidth = gradOutput->size[dimw];
+
+ /* get raw pointers */
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+ indices_data = THIndexTensor_(data)(indices);
+
+ /* backprop */
+ if (input->nDimension == 3)
+ {
+ THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data, gradOutput_data,
+ indices_data+nslices*owidth*oheight, indices_data,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight);
+ }
+ else
+ {
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight,
+ indices_data+(p+nbatch)*nslices*owidth*oheight, indices_data+p*nslices*owidth*oheight,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialAveragePooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialAveragePooling.c
new file mode 100644
index 000000000..c063502e7
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialAveragePooling.c
@@ -0,0 +1,329 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialAveragePooling.c"
+#else
+
+static inline void THNN_(SpatialAveragePooling_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ int kH, int kW, int dH, int dW, int padH, int padW,
+ bool ceil_mode) {
+
+ THArgCheck(kW > 0 && kH > 0, 5,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 8,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ THArgCheck(kW/2 >= padW && kH/2 >= padH, 2,
+ "pad should be smaller than half of kernel size, but got "
+ "padW = %d, padH = %d, kW = %d, kH = %d",
+ padW, padH, kW, kH);
+
+ long nInputPlane = input->size[dimh-1];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long outputHeight, outputWidth;
+ long nOutputPlane = nInputPlane;
+
+ if(ceil_mode)
+ {
+ outputHeight = (long)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1;
+ outputWidth = (long)(ceil((float)(inputWidth - kW + 2*padW) / dW)) + 1;
+ }
+ else
+ {
+ outputHeight = (long)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1;
+ outputWidth = (long)(floor((float)(inputWidth - kW + 2*padW) / dW)) + 1;
+ }
+
+ if (padW || padH)
+ {
+ // ensure that the last pooling starts inside the image
+ // needed to avoid problems in ceil mode
+ if ((outputHeight - 1)*dH >= inputHeight + padH)
+ --outputHeight;
+ if ((outputWidth - 1)*dW >= inputWidth + padW)
+ --outputWidth;
+ }
+
+ if (outputWidth < 1 || outputHeight < 1)
+    THError("Given input size: (%ldx%ldx%ld). "
+            "Calculated output size: (%ldx%ldx%ld). Output size is too small",
+            nInputPlane,inputHeight,inputWidth,nInputPlane,outputHeight,outputWidth);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
+void THNN_(SpatialAveragePooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ bool ceil_mode,
+ bool count_include_pad)
+{
+ real *output_data;
+ real *input_data;
+
+ int dimw = 2;
+ int dimh = 1;
+ int dimc = 0;
+ long nbatch = 1;
+
+ long inputWidth;
+ long inputHeight;
+ long outputWidth;
+ long outputHeight;
+ long nInputPlane; // number of channels (or colors)
+
+ long k;
+
+ THNN_(SpatialAveragePooling_shapeCheck)
+ (input, NULL, kH, kW, dH, dW, padH, padW, ceil_mode);
+
+ if (input->nDimension == 4) {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ dimc++;
+ }
+
+ inputWidth = input->size[dimw];
+ inputHeight = input->size[dimh];
+ nInputPlane = input->size[dimc];
+
+ if(ceil_mode)
+ {
+ outputWidth = (long)(ceil((float)(inputWidth - kW + 2*padW) / dW)) + 1;
+ outputHeight = (long)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1;
+ }
+ else
+ {
+ outputWidth = (long)(floor((float)(inputWidth - kW + 2*padW) / dW)) + 1;
+ outputHeight = (long)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1;
+ }
+ if (padW || padH)
+ {
+ // ensure that the last pooling starts inside the image
+ // needed to avoid problems in ceil mode
+ if ((outputHeight - 1)*dH >= inputHeight + padH)
+ --outputHeight;
+ if ((outputWidth - 1)*dW >= inputWidth + padW)
+ --outputWidth;
+ }
+
+ if (input->nDimension == 3)
+ THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth);
+ else
+ THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
+ THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(k)
+ for(k = 0; k < nInputPlane; k++)
+ {
+ long p;
+ for(p = 0; p < nbatch; p++)
+ {
+ long xx, yy;
+ /* For all output pixels... */
+ real *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight;
+ real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
+ long i;
+ for(i = 0; i < outputWidth*outputHeight; i++)
+ ptr_output[i] = 0;
+
+ for(yy = 0; yy < outputHeight; yy++)
+ {
+ for(xx = 0; xx < outputWidth; xx++)
+ {
+ /* Compute the mean of the input image... */
+ long hstart = yy * dH - padH;
+ long wstart = xx * dW - padW;
+ long hend = fminf(hstart + kH, inputHeight + padH);
+ long wend = fminf(wstart + kW, inputWidth + padW);
+ int pool_size = (hend - hstart) * (wend - wstart);
+ hstart = fmaxf(hstart, 0);
+ wstart = fmaxf(wstart, 0);
+ hend = fminf(hend, inputHeight);
+ wend = fminf(wend, inputWidth);
+
+ real sum = 0;
+
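+          /* pool_size counts padded cells; with count_include_pad the sum is
+             divided by the full window, otherwise only by the cells that
+             remain inside the image after clipping */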
+ int divide_factor;
+ if(count_include_pad)
+ divide_factor = pool_size;
+ else
+ divide_factor = (hend - hstart) * (wend - wstart);
+
+ long kx, ky;
+
+ for(ky = hstart; ky < hend; ky++)
+ {
+ for(kx = wstart; kx < wend; kx++)
+ sum += ptr_input[ky*inputWidth + kx];
+ }
+ /* Update output */
+ *ptr_output++ += sum/divide_factor;
+ }
+ }
+ }
+ }
+ THTensor_(free)(input);
+}
+
+void THNN_(SpatialAveragePooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ bool ceil_mode,
+ bool count_include_pad)
+{
+ int dimw = 2;
+ int dimh = 1;
+ int dimc = 0;
+ long nbatch = 1;
+ long ndim = 3;
+
+ long inputWidth;
+ long inputHeight;
+ long outputWidth;
+ long outputHeight;
+ long nInputPlane; // number of channels (or colors)
+
+ real *gradOutput_data;
+ real *input_data, *gradInput_data;
+
+ long k;
+
+ THNN_(SpatialAveragePooling_shapeCheck)
+ (input, gradOutput, kH, kW, dH, dW, padH, padW, ceil_mode);
+
+
+ if (input->nDimension == 4) {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ dimc++;
+ ndim = 4;
+ }
+
+ inputWidth = input->size[dimw];
+ inputHeight = input->size[dimh];
+ nInputPlane = input->size[dimc];
+
+ if(ceil_mode)
+ {
+ outputWidth = (long)(ceil((float)(inputWidth - kW + 2*padW) / dW)) + 1;
+ outputHeight = (long)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1;
+ }
+ else
+ {
+ outputWidth = (long)(floor((float)(inputWidth - kW + 2*padW) / dW)) + 1;
+ outputHeight = (long)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1;
+ }
+ if (padW || padH)
+ {
+ // ensure that the last pooling starts inside the image
+ // needed to avoid problems in ceil mode
+ if ((outputHeight - 1)*dH >= inputHeight + padH)
+ --outputHeight;
+ if ((outputWidth - 1)*dW >= inputWidth + padW)
+ --outputWidth;
+ }
+
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+
+ THTensor_(resizeAs)(gradInput, input);
+
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ THArgCheck(THTensor_(isContiguous)(gradInput), 4, "gradInput must be contiguous");
+
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+
+#pragma omp parallel for private(k)
+ for(k = 0; k < nInputPlane; k++)
+ {
+ long p;
+ for(p = 0; p < nbatch; p++)
+ {
+ real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
+ long xx, yy;
+
+      real *ptr_gradInput = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
+
+      long i;
+      for(i = 0; i < inputWidth*inputHeight; i++)
+        ptr_gradInput[i] = 0.0;
+
+ for(yy = 0; yy < outputHeight; yy++)
+ {
+ for(xx = 0; xx < outputWidth; xx++)
+ {
+ long hstart = yy * dH - padH;
+ long wstart = xx * dW - padW;
+ long hend = fminf(hstart + kH, inputHeight + padH);
+ long wend = fminf(wstart + kW, inputWidth + padW);
+ int pool_size = (hend - hstart) * (wend - wstart);
+ hstart = fmaxf(hstart, 0);
+ wstart = fmaxf(wstart, 0);
+ hend = fminf(hend, inputHeight);
+ wend = fminf(wend, inputWidth);
+
+ real z = *ptr_gradOutput++;
+
+ int divide_factor;
+ if(count_include_pad)
+ divide_factor = pool_size;
+ else
+ divide_factor = (hend - hstart) * (wend - wstart);
+
+ long kx, ky;
+ for(ky = hstart ; ky < hend; ky++)
+ {
+ for(kx = wstart; kx < wend; kx++)
+ ptr_gradInput[ky*inputWidth + kx] += z/divide_factor;
+ }
+ }
+ }
+ }
+ }
+
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialClassNLLCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialClassNLLCriterion.c
new file mode 100644
index 000000000..d711c8590
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialClassNLLCriterion.c
@@ -0,0 +1,131 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialClassNLLCriterion.c"
+#else
+
+#define INITIAL_CHECK \
+ THArgCheck(THIndexTensor_(nDimension)(target) == 3, 3, \
+ "only batches of spatial targets supported (3D tensors)" \
+ " but got targets of dimension: %d", \
+ THIndexTensor_(nDimension)(target)); \
+ THArgCheck(THTensor_(nDimension)(input) == 4, 2, \
+ "only batches of spatial inputs supported (4D tensors), " \
+ "but got input of dimension: %d", THTensor_(nDimension)(input)); \
+ if (weights && THTensor_(nElement)(weights) != THTensor_(size)(input, 1)) { \
+ THError("weight tensor should be defined either for all or no classes"); \
+ } \
+ \
+ { \
+ long input0 = THTensor_(size)(input, 0); \
+ long input1 = THTensor_(size)(input, 1); \
+ long input2 = THTensor_(size)(input, 2); \
+ long input3 = THTensor_(size)(input, 3); \
+ long target0 = THIndexTensor_(size)(target, 0); \
+ long target1 = THIndexTensor_(size)(target, 1); \
+ long target2 = THIndexTensor_(size)(target, 2); \
+ THAssertMsg(input0 == target0 && input2 == target1 && input3 == target2, \
+ "size mismatch (got input: %ldx%ldx%ldx%ld, target: %ldx%ldx%ld)", \
+ input0, input1, input2, input3, target0, target1, target2); \
+ }
+
+void THNN_(SpatialClassNLLCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *output,
+ bool sizeAverage,
+ THTensor *weights,
+ THTensor *total_weight)
+{
+ INITIAL_CHECK;
+
+ input = THTensor_(newContiguous)(input);
+ target = THIndexTensor_(newContiguous)(target);
+ weights = weights ? THTensor_(newContiguous)(weights) : NULL;
+
+ real *input_data = THTensor_(data)(input);
+ THIndex_t *target_data = THIndexTensor_(data)(target);
+ real *weights_data = weights ? THTensor_(data)(weights) : NULL;
+ real *output_data = THTensor_(data)(output);
+ real *total_weight_data = THTensor_(data)(total_weight);
+
+ long batch_size = THTensor_(size)(input, 0);
+ long n_classes = THTensor_(size)(input, 1);
+ long map_size = THTensor_(size)(input, 2) * THTensor_(size)(input, 3);
+ long sample_size = map_size * n_classes;
+
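+  /* NLL reduction over the spatial map:
+     output = -sum over (b,h,w) of weight[target] * input[b][target][h][w],
+     where input is assumed to hold log-probabilities (e.g. LogSoftMax output) */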
+ real total_weight_acc = 0;
+ real output_acc = 0;
+ for (int b = 0; b < batch_size; b++) {
+ for (int elem = 0; elem < map_size; elem++) {
+ int cur_target = target_data[b * map_size + elem] - TH_INDEX_BASE;
+ THAssert(cur_target >= 0 && cur_target < n_classes);
+
+ real cur_weight = weights ? weights_data[cur_target] : 1.0f;
+ total_weight_acc += cur_weight;
+ output_acc -= input_data[b * sample_size + cur_target * map_size + elem] * cur_weight;
+ }
+ }
+ *total_weight_data = total_weight_acc;
+ *output_data = output_acc;
+
+ if (sizeAverage && *total_weight_data)
+ *output_data /= *total_weight_data;
+
+ THTensor_(free)(input);
+ THIndexTensor_(free)(target);
+ if (weights)
+ THTensor_(free)(weights);
+}
+
+void THNN_(SpatialClassNLLCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage,
+ THTensor *weights,
+ THTensor *total_weight)
+{
+ INITIAL_CHECK;
+ THArgCheck(THTensor_(isContiguous)(gradInput), 4,
+ "gradInput must be contiguous");
+
+ real *total_weight_data = THTensor_(data)(total_weight);
+ if (*total_weight_data <= 0)
+ return;
+
+ target = THIndexTensor_(newContiguous)(target);
+ weights = weights ? THTensor_(newContiguous)(weights) : NULL;
+
+ THIndex_t *target_data = THIndexTensor_(data)(target);
+ real *weights_data = weights ? THTensor_(data)(weights) : NULL;
+ real *gradInput_data = THTensor_(data)(gradInput);
+
+ long batch_size = THTensor_(size)(input, 0);
+ long n_classes = THTensor_(size)(input, 1);
+ long map_size = THTensor_(size)(input, 2) * THTensor_(size)(input, 3);
+ long sample_size = map_size * n_classes;
+
+ real normalize = sizeAverage ? *total_weight_data : 1.0f;
+
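+  /* the gradient is -weight[target] / normalize at each target entry and
+     zero elsewhere; gradInput is assumed to be zero-initialized by the caller */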
+ int b;
+ #pragma omp parallel for
+ for (b = 0; b < batch_size; b++) {
+ int elem;
+ for (elem = 0; elem < map_size; elem++) {
+ int cur_target = target_data[b * map_size + elem] - TH_INDEX_BASE;
+ THAssert(cur_target >= 0 && cur_target < n_classes);
+
+ gradInput_data[b * sample_size + cur_target * map_size + elem] =
+ -(weights ? weights_data[cur_target] : 1.0f) / normalize;
+ }
+ }
+
+ THIndexTensor_(free)(target);
+ if (weights)
+ THTensor_(free)(weights);
+}
+
+#undef INITIAL_CHECK
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionLocal.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionLocal.c
new file mode 100644
index 000000000..6db5a5db9
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionLocal.c
@@ -0,0 +1,367 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialConvolutionLocal.c"
+#else
+
+static inline void THNN_(SpatialConvolutionLocal_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int kH, int kW, int dH,
+ int dW, int padH, int padW,
+ long inputHeight, long inputWidth,
+ long outputHeight, long outputWidth) {
+
+ THArgCheck(kW > 0 && kH > 0, 9,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ long nInputPlane = weight->size[2] / (kH * kW);
+ long nOutputPlane = weight->size[1];
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 3, 0, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(bias, 3, 1, outputHeight);
+ THNN_CHECK_DIM_SIZE(bias, 3, 2, outputWidth);
+ }
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
+static THTensor* THNN_(view_weight_local)(THTensor *_weight)
+{
+ THTensor *weight = THTensor_(newContiguous)(_weight);
+ THArgCheck(weight->nDimension == 3 || weight->nDimension == 6, 4,
+ "weight tensor should be 3D or 6D - got %dD", weight->nDimension);
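+  // A 6D weight is assumed to be laid out as oH x oW x nOutputPlane x
+  // nInputPlane x kH x kW; fold it into the 3D view
+  // (oH*oW) x nOutputPlane x (nInputPlane*kH*kW) used by the batched GEMM.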
+ if (weight->nDimension == 6) {
+ long s1 = weight->size[0] * weight->size[1];
+ long s2 = weight->size[2];
+ long s3 = weight->size[3] * weight->size[4] * weight->size[5];
+ THTensor *old_weight = weight;
+ weight = THTensor_(newWithStorage3d)(weight->storage,
+ weight->storageOffset,
+ s1, -1, s2, -1, s3, -1);
+ THTensor_(free)(old_weight);
+ }
+ return weight;
+}
+
+static void THNN_(SpatialConvolutionLocal_updateOutput_frame)
+ (
+ THTensor *input, THTensor *output,
+ THTensor *weight, THTensor *bias, THTensor *finput,
+ int kW, int kH, int dW, int dH, int padW, int padH,
+ long nInputPlane, long inputWidth, long inputHeight,
+ long nOutputPlane, long outputWidth, long outputHeight)
+{
+ long i;
+ THTensor *output3d, *finput3d;
+
+ THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ outputWidth, outputHeight);
+
+ THTensor_(copy)(output, bias);
+
+ output3d = THTensor_(newWithStorage3d)
+ (output->storage, output->storageOffset,
+ outputHeight * outputWidth, 1,
+ nOutputPlane, outputHeight * outputWidth,
+ 1, nOutputPlane * outputHeight * outputWidth);
+
+ finput3d = THTensor_(newWithStorage3d)
+ (finput->storage, finput->storageOffset,
+ outputHeight * outputWidth, 1,
+ kW * kH * nInputPlane, outputHeight * outputWidth,
+ 1, kW * kH * nInputPlane * outputHeight * outputWidth);
+
+ // weight: oH*oW x nOutputPlane x nInputPlane*kH*kW
+ // finput3d: oH*oW x nInputPlane*kH*kW x 1
+ THTensor_(baddbmm)(output3d, 1.0, output3d, 1.0, weight, finput3d);
+ // output3d: oH*oW x nOutputPlane x 1
+
+ THTensor_(free)(output3d);
+ THTensor_(free)(finput3d);
+}
+
+void THNN_(SpatialConvolutionLocal_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ long inputWidth, long inputHeight,
+ long outputWidth, long outputHeight)
+{
+ weight = THNN_(view_weight_local)(weight);
+
+ THNN_(SpatialConvolutionLocal_shapeCheck)
+ (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW,
+ inputHeight, inputWidth, outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
+
+ long nInputPlane = THTensor_(size)(weight, 2)/ (kW * kH);
+ long nOutputPlane = THTensor_(size)(weight, 1);
+
+ if(input->nDimension == 3)
+ {
+ THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth);
+ THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
+
+ THNN_(SpatialConvolutionLocal_updateOutput_frame)
+ (input, output, weight, bias, finput,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+ }
+ else
+ {
+ long T = input->size[0];
+ long t;
+
+ THTensor_(resize3d)(finput, T, kW*kH*nInputPlane, outputHeight*outputWidth);
+ THTensor_(resize4d)(output, T, nOutputPlane, outputHeight, outputWidth);
+
+#pragma omp parallel for private(t)
+ for(t = 0; t < T; t++)
+ {
+ THTensor *input_t = THTensor_(newSelect)(input, 0, t);
+ THTensor *output_t = THTensor_(newSelect)(output, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ THNN_(SpatialConvolutionLocal_updateOutput_frame)
+ (input_t, output_t, weight, bias, finput_t,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+
+ THTensor_(free)(input_t);
+ THTensor_(free)(output_t);
+ THTensor_(free)(finput_t);
+ }
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(weight);
+}
+
+
+static void THNN_(SpatialConvolutionLocal_updateGradInput_frame)
+ (THTensor *gradInput, THTensor *gradOutput,
+ THTensor *weight, THTensor *fgradInput,
+ int kW, int kH, int dW, int dH, int padW, int padH,
+ long nInputPlane, long inputWidth, long inputHeight,
+ long nOutputPlane, long outputWidth, long outputHeight)
+{
+ THTensor *gradOutput3d, *fgradInput3d;
+ gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset,
+ outputHeight*outputWidth, 1,
+ nOutputPlane, outputHeight*outputWidth,
+ 1, nOutputPlane*outputHeight*outputWidth);
+ fgradInput3d = THTensor_(newWithStorage3d)(fgradInput->storage, fgradInput->storageOffset,
+ outputHeight*outputWidth, 1,
+ kW*kH*nInputPlane, outputHeight*outputWidth,
+ 1, kW*kH*nInputPlane*outputHeight*outputWidth);
+ // weight: oH*oW x nInputPlane*kH*kW x nOutputPlane
+ // gradOutput3d: oH*oW x nOutputPlane x 1
+ THTensor_(baddbmm)(fgradInput3d, 0.0, fgradInput3d, 1.0, weight, gradOutput3d);
+ // fgradInput3d: oH*oW x nInputPlane*kH*kW x 1
+
+ THTensor_(free)(gradOutput3d);
+ THTensor_(free)(fgradInput3d);
+
+ THTensor_(zero)(gradInput);
+
+ THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ outputWidth, outputHeight);
+
+}
+
+void THNN_(SpatialConvolutionLocal_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ long inputWidth, long inputHeight,
+ long outputWidth, long outputHeight)
+{
+ weight = THNN_(view_weight_local)(weight);
+
+ THNN_(SpatialConvolutionLocal_shapeCheck)
+ (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW,
+ inputHeight, inputWidth, outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ long nInputPlane = THTensor_(size)(weight,2)/(kW*kH);
+ long nOutputPlane = THTensor_(size)(weight,1);
+
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(resizeAs)(fgradInput, finput);
+
+ THTensor *tweight = THTensor_(new)();
+ THTensor_(transpose)(tweight, weight, 1, 2);
+
+ if(input->nDimension == 3)
+ {
+ THNN_(SpatialConvolutionLocal_updateGradInput_frame)
+ (gradInput, gradOutput, tweight,
+ fgradInput, kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+ }
+ else
+ {
+ long T = input->size[0];
+ long t;
+
+#pragma omp parallel for private(t)
+ for(t = 0; t < T; t++)
+ {
+ THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
+
+ THNN_(SpatialConvolutionLocal_updateGradInput_frame)
+ (gradInput_t, gradOutput_t, tweight, fgradInput_t,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+
+ THTensor_(free)(gradInput_t);
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(fgradInput_t);
+ }
+ }
+
+ THTensor_(free)(tweight);
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+}
+
+static void THNN_(SpatialConvolutionLocal_accGradParameters_frame)
+ (THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias,
+ THTensor *finput, real scale,
+ int kW, int kH, int dW, int dH, int padW, int padH,
+ long nInputPlane, long inputWidth, long inputHeight,
+ long nOutputPlane, long outputWidth, long outputHeight)
+{
+
+ THTensor *gradOutput3d, *finput3d;
+ gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset,
+ outputHeight*outputWidth, 1,
+ nOutputPlane, outputHeight*outputWidth,
+ 1, nOutputPlane*outputHeight*outputWidth);
+ finput3d = THTensor_(newWithStorage3d)(finput->storage, finput->storageOffset,
+ outputHeight*outputWidth, 1,
+ 1, kW*kH*nInputPlane*outputHeight*outputWidth,
+ kW*kH*nInputPlane, outputHeight*outputWidth);
+ // gradOutput3d: oH*oW x nOutputPlane x 1
+ // finput3d: oH*oW x 1 x kW*kH*nInputPlane
+ THTensor_(baddbmm)(gradWeight, 1.0, gradWeight, scale, gradOutput3d, finput3d);
+ // gradWeight: oH*oW x nOutputPlane x kW*kH*nInputPlane
+
+ THTensor_(cadd)(gradBias, gradBias, scale, gradOutput);
+
+ THTensor_(free)(gradOutput3d);
+ THTensor_(free)(finput3d);
+}
+
+void THNN_(SpatialConvolutionLocal_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ long inputWidth, long inputHeight,
+ long outputWidth, long outputHeight,
+ accreal scale_)
+{
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
+ THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ gradWeight = THNN_(view_weight_local)(gradWeight);
+
+ THNN_(SpatialConvolutionLocal_shapeCheck)
+ (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW,
+ inputHeight, inputWidth, outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ long nInputPlane = THTensor_(size)(gradWeight,2)/(kW*kH);
+ long nOutputPlane = THTensor_(size)(gradWeight,1);
+
+ if(input->nDimension == 3)
+ {
+ THNN_(SpatialConvolutionLocal_accGradParameters_frame)
+ (gradOutput, gradWeight, gradBias, finput, scale,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+ }
+ else
+ {
+ long T = input->size[0];
+ long t;
+
+ for(t = 0; t < T; t++)
+ {
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ THNN_(SpatialConvolutionLocal_accGradParameters_frame)
+ (gradOutput_t, gradWeight, gradBias, finput_t, scale,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(finput_t);
+ }
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(gradWeight);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMM.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMM.c
new file mode 100644
index 000000000..28fea517c
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMM.c
@@ -0,0 +1,377 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialConvolutionMM.c"
+#else
+
+static inline void THNN_(SpatialConvolutionMM_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int kH, int kW, int dH, int dW, int padH, int padW) {
+
+ THArgCheck(kW > 0 && kH > 0, 9,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+ THNN_ARGCHECK(weight->nDimension == 2 || weight->nDimension == 4, 5, weight,
+ "2D or 4D weight tensor expected, but got: %s");
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]);
+ }
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ long nInputPlane = weight->size[1] / (kH * kW);
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long nOutputPlane = weight->size[0];
+ long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
+
+ if (outputWidth < 1 || outputHeight < 1)
+    THError("Given input size: (%ld x %ld x %ld). "
+            "Calculated output size: (%ld x %ld x %ld). Output size is too small",
+            nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
+static THTensor* THNN_(view_weight_MM2d)(THTensor *weight) {
+ weight = THTensor_(newContiguous)(weight);
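+  // Fold a 4D weight (nOutputPlane x nInputPlane x kH x kW) into the 2D view
+  // nOutputPlane x (nInputPlane*kH*kW) expected by the GEMM below.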
+ if (weight->nDimension == 4) {
+ long s1 = weight->size[0];
+ long s2 = weight->size[1] * weight->size[2] * weight->size[3];
+ THTensor *old_weight = weight;
+ weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset,
+ s1, -1, s2, -1);
+ THTensor_(free)(old_weight);
+ }
+ return weight;
+}
+
+static void THNN_(SpatialConvolutionMM_updateOutput_frame)(
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ long nInputPlane,
+ long inputWidth,
+ long inputHeight,
+ long nOutputPlane,
+ long outputWidth,
+ long outputHeight)
+{
+ long i;
+ THTensor *output2d;
+
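+  // unfolded_copy is the im2col step: finput becomes a
+  // (nInputPlane*kH*kW) x (outputHeight*outputWidth) matrix, so the
+  // convolution reduces to the single GEMM output2d = bias + weight * finput.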
+ THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ outputWidth, outputHeight);
+
+ output2d = THTensor_(newWithStorage2d)(output->storage, output->storageOffset,
+ nOutputPlane, -1,
+ outputHeight*outputWidth, -1);
+ if (bias) {
+ for(i = 0; i < nOutputPlane; i++)
+ THVector_(fill)
+ (output->storage->data + output->storageOffset + output->stride[0] * i,
+ THTensor_(get1d)(bias, i), outputHeight*outputWidth);
+ } else {
+ THTensor_(zero)(output);
+ }
+
+ THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput);
+
+ THTensor_(free)(output2d);
+}
+
+void THNN_(SpatialConvolutionMM_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH)
+{
+ weight = THNN_(view_weight_MM2d)(weight);
+
+ THNN_(SpatialConvolutionMM_shapeCheck)
+ (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW);
+
+ input = THTensor_(newContiguous)(input);
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ long nInputPlane = input->size[dimf];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long nOutputPlane = weight->size[0];
+ long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
+
+ if(input->nDimension == 3)
+ {
+ THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth);
+ THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
+
+ THNN_(SpatialConvolutionMM_updateOutput_frame)
+ (input, output, weight, bias, finput,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+ }
+ else
+ {
+ long T = input->size[0];
+ long t;
+
+ THTensor_(resize3d)(finput, T, kW*kH*nInputPlane, outputHeight*outputWidth);
+ THTensor_(resize4d)(output, T, nOutputPlane, outputHeight, outputWidth);
+
+#pragma omp parallel for private(t)
+ for(t = 0; t < T; t++)
+ {
+ THTensor *input_t = THTensor_(newSelect)(input, 0, t);
+ THTensor *output_t = THTensor_(newSelect)(output, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ THNN_(SpatialConvolutionMM_updateOutput_frame)
+ (input_t, output_t, weight, bias, finput_t,
+ kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+
+ THTensor_(free)(input_t);
+ THTensor_(free)(output_t);
+ THTensor_(free)(finput_t);
+ }
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(weight);
+}
+
+static void THNN_(SpatialConvolutionMM_updateGradInput_frame)(
+ THTensor *gradInput,
+ THTensor *gradOutput,
+ THTensor *weight,
+ THTensor *fgradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH)
+{
+ THTensor *gradOutput2d = THTensor_(newWithStorage2d)
+ (gradOutput->storage, gradOutput->storageOffset,
+ gradOutput->size[0], -1,
+ gradOutput->size[1]*gradOutput->size[2], -1);
+ THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d);
+ THTensor_(free)(gradOutput2d);
+
+ THTensor_(zero)(gradInput);
+
+ THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH,
+ padW, padH,
+ gradInput->size[0], gradInput->size[2], gradInput->size[1],
+ gradOutput->size[2], gradOutput->size[1]);
+}
+
+void THNN_(SpatialConvolutionMM_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH)
+{
+ weight = THNN_(view_weight_MM2d)(weight);
+
+ THNN_(SpatialConvolutionMM_shapeCheck)
+ (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW);
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(resizeAs)(fgradInput, finput);
+
+ // depending on the BLAS library, fgradInput (result tensor) might
+ // be left uninitialized on zero alpha, which might lead to weird behavior
+ // hence, to be safe, zero it
+ THTensor_(zero)(fgradInput);
+ THTensor *tweight = THTensor_(new)();
+ THTensor_(transpose)(tweight, weight, 0, 1);
+
+ if(input->nDimension == 3)
+ {
+ THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput, gradOutput,
+ tweight, fgradInput,
+ kW, kH, dW, dH, padW, padH);
+ }
+ else
+ {
+ long T = input->size[0];
+ long t;
+
+#pragma omp parallel for private(t)
+ for(t = 0; t < T; t++)
+ {
+ THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
+
+ THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput_t, gradOutput_t,
+ tweight, fgradInput_t,
+ kW, kH, dW, dH, padW, padH);
+
+ THTensor_(free)(gradInput_t);
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(fgradInput_t);
+ }
+ }
+
+ THTensor_(free)(tweight);
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+}
+
+static void THNN_(SpatialConvolutionMM_accGradParameters_frame)(
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ real scale)
+{
+ long i;
+ THTensor *gradOutput2d = THTensor_(newWithStorage2d)
+ (gradOutput->storage, gradOutput->storageOffset,
+ gradOutput->size[0], -1,
+ gradOutput->size[1]*gradOutput->size[2], -1);
+
+ THTensor *tfinput = THTensor_(new)();
+ THTensor_(transpose)(tfinput, finput, 0, 1);
+ THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, tfinput);
+ THTensor_(free)(tfinput);
+
+ if (gradBias) {
+ for(i = 0; i < gradBias->size[0]; i++)
+ {
+ long k;
+ real sum = 0;
+ real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0];
+ for(k = 0; k < gradOutput2d->size[1]; k++)
+ sum += data[k];
+ (gradBias->storage->data + gradBias->storageOffset)[i] += scale*sum;
+ }
+ }
+
+ THTensor_(free)(gradOutput2d);
+}
+
+void THNN_(SpatialConvolutionMM_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ accreal scale_)
+{
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
+ if (gradBias)
+ THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
+
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ gradWeight = THNN_(view_weight_MM2d)(gradWeight);
+
+ THNN_(SpatialConvolutionMM_shapeCheck)
+ (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW);
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ if(input->nDimension == 3)
+ {
+ THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight,
+ gradBias, finput, scale);
+ }
+ else
+ {
+ long T = input->size[0];
+ long t;
+
+ for(t = 0; t < T; t++)
+ {
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight,
+ gradBias, finput_t, scale);
+
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(finput_t);
+ }
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(gradWeight);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMap.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMap.c
new file mode 100644
index 000000000..142a03551
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMap.c
@@ -0,0 +1,277 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialConvolutionMap.c"
+#else
+
+void THNN_(SpatialConvolutionMap_updateOutput)(
+ THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias,
+ THTensor *connTable, int nInputPlane, int nOutputPlane,
+ int dW, int dH)
+{
+ THArgCheck(
+ weight != NULL && weight->nDimension == 3
+ && connTable != NULL && connTable->size[0] == weight->size[0], 4,
+ "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
+ );
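+  // Each row k of connTable is a 1-based (inputPlane, outputPlane) pair,
+  // read below as connTable_data[k*2+0] / connTable_data[k*2+1].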
+
+ int dimw = 2;
+ int dimh = 1;
+ int dimc = 0;
+ long nbatch = 1;
+
+  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch mode) tensor expected");
+
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimc++;
+ dimw++;
+ dimh++;
+ }
+
+ const long kH = weight->size[1];
+ const long kW = weight->size[2];
+
+ THArgCheck(input->size[dimc] >= nInputPlane, 2, "invalid number of input planes");
+ THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH, 2, "input image smaller than kernel size");
+
+ const long input_w = input->size[dimw];
+ const long input_h = input->size[dimh];
+ const long output_w = (input_w - kW) / dW + 1;
+ const long output_h = (input_h - kH) / dH + 1;
+
+ if (input->nDimension == 3)
+ THTensor_(resize3d)(output, nOutputPlane, output_h, output_w);
+ else
+ THTensor_(resize4d)(output, input->size[0], nOutputPlane, output_h, output_w);
+
+ /* contiguous */
+ input = THTensor_(newContiguous)(input);
+ output = THTensor_(newContiguous)(output);
+ weight = THTensor_(newContiguous)(weight);
+ bias = bias ? THTensor_(newContiguous)(bias) : bias;
+ connTable = THTensor_(newContiguous)(connTable);
+
+ /* get raw pointers */
+ real *input_data = THTensor_(data)(input);
+ real *output_data = THTensor_(data)(output);
+ real *weight_data = THTensor_(data)(weight);
+  real *bias_data = bias ? THTensor_(data)(bias) : NULL;
+ real *connTable_data = THTensor_(data)(connTable);
+
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nOutputPlane; p++)
+ {
+ long m;
+ for (m = 0; m < nbatch; m++)
+ {
+ /* add bias */
+ real *ptr_output = output_data + p*output_w*output_h + m*nOutputPlane*output_w*output_h;
+ long j, k;
+      real z = bias_data ? bias_data[p] : 0;
+ for (j = 0; j < output_h*output_w; j++)
+ ptr_output[j] = z;
+
+ /* convolve all maps */
+ int nweight = connTable->size[0];
+ for (k = 0; k < nweight; k++)
+ {
+ /* get offsets for input/output */
+ int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE;
+ int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE;
+
+ if (o == p)
+ {
+ THTensor_(validXCorr2Dptr)(
+ output_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h,
+ 1.0,
+ input_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, input_h, input_w,
+ weight_data + k*kW*kH,
+ kH, kW,
+ dH, dW
+ );
+ }
+ }
+ }
+ }
+
+ /* clean up */
+ THTensor_(free)(input);
+ THTensor_(free)(output);
+ THTensor_(free)(weight);
+ if (bias) THTensor_(free)(bias);
+ THTensor_(free)(connTable);
+}
+
+void THNN_(SpatialConvolutionMap_updateGradInput)(
+ THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *weight, THTensor *bias,
+ THTensor *connTable, int nInputPlane, int nOutputPlane,
+ int dW, int dH)
+{
+ THArgCheck(
+ weight != NULL && weight->nDimension == 3
+ && connTable != NULL && connTable->size[0] == weight->size[0], 5,
+ "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
+ );
+
+ /* and dims */
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ const long input_h = input->size[dimh];
+ const long input_w = input->size[dimw];
+ const long output_h = gradOutput->size[dimh];
+ const long output_w = gradOutput->size[dimw];
+ const long kH = weight->size[1];
+ const long kW = weight->size[2];
+
+ /* contiguous */
+ gradInput = THTensor_(newContiguous)(gradInput);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ weight = THTensor_(newContiguous)(weight);
+ connTable = THTensor_(newContiguous)(connTable);
+
+ /* Resize/Zero */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ /* get raw pointers */
+ real *gradInput_data = THTensor_(data)(gradInput);
+ real *gradOutput_data = THTensor_(data)(gradOutput);
+ real *weight_data = THTensor_(data)(weight);
+ real *connTable_data = THTensor_(data)(connTable);
+
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nInputPlane; p++)
+ {
+ long m;
+ for (m = 0; m < nbatch; m++)
+ {
+ long k;
+ /* backward all */
+ int nkernel = connTable->size[0];
+ for (k = 0; k < nkernel; k++)
+ {
+ int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE;
+ int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE;
+ if (i == p)
+ {
+ /* gradient to input */
+ THTensor_(fullConv2Dptr)(
+ gradInput_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, 1.0,
+ gradOutput_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h, output_h, output_w,
+ weight_data + k*kW*kH, kH, kW, dH, dW
+ );
+ }
+ }
+ }
+ }
+
+ /* clean up */
+ THTensor_(free)(gradInput);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+ THTensor_(free)(connTable);
+}
+
+void THNN_(SpatialConvolutionMap_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *connTable,
+ int nInputPlane,
+ int nOutputPlane,
+ int dW, int dH,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THArgCheck(
+ gradWeight != NULL && gradWeight->nDimension == 3
+ && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5,
+ "3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
+ );
+
+ /* and dims */
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ const long input_h = input->size[dimh];
+ const long input_w = input->size[dimw];
+ const long output_h = gradOutput->size[dimh];
+ const long output_w = gradOutput->size[dimw];
+ const long kH = gradWeight->size[1];
+ const long kW = gradWeight->size[2];
+
+ /* contiguous */
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
+ THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
+
+ /* get raw pointers */
+ real *input_data = THTensor_(data)(input);
+ real *gradOutput_data = THTensor_(data)(gradOutput);
+ real *gradWeight_data = THTensor_(data)(gradWeight);
+ real *gradBias_data = THTensor_(data)(gradBias);
+
+ long k;
+ /* gradients wrt bias */
+#pragma omp parallel for private(k)
+ for (k = 0; k < nOutputPlane; k++)
+ {
+ long m;
+ for (m = 0; m < nbatch; m++)
+ {
+ real *ptr_gradOutput = gradOutput_data + k*output_w*output_h + m*nOutputPlane*output_w*output_h;
+ long l;
+ for (l = 0; l < output_h*output_w; l++)
+ gradBias_data[k] += scale*ptr_gradOutput[l];
+ }
+ }
+
+ /* gradients wrt weight */
+ const int nkernel = connTable->size[0];
+#pragma omp parallel for private(k)
+ for (k = 0; k < nkernel; k++)
+ {
+ long m;
+ for (m = 0; m < nbatch; m++)
+ {
+ int o = (int)THTensor_(get2d)(connTable,k,1) - TH_INDEX_BASE;
+ int i = (int)THTensor_(get2d)(connTable,k,0) - TH_INDEX_BASE;
+
+ /* gradient to kernel */
+ THTensor_(validXCorr2DRevptr)(
+ gradWeight_data + k*kW*kH,
+ scale,
+ input_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, input_h, input_w,
+ gradOutput_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h , output_h, output_w,
+ dH, dW
+ );
+ }
+ }
+
+ /* clean up */
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialDepthWiseConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialDepthWiseConvolution.c
new file mode 100644
index 000000000..efb66a3e3
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialDepthWiseConvolution.c
@@ -0,0 +1,528 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialDepthWiseConvolution.c"
+#else
+
+static inline void THNN_(SpatialDepthWiseConvolution_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int kH, int kW, int dH, int dW, int padH, int padW) {
+
+ THArgCheck(kW > 0 && kH > 0, 9,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+ THNN_ARGCHECK(weight->nDimension == 4, 5, weight,
+ "2D or 4D weight tensor expected, but got: %s");
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 2, 0, weight->size[0]);
+ THNN_CHECK_DIM_SIZE(bias, 2, 1, weight->size[1]);
+ }
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ long nInputPlane = weight->size[1];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long nOutputPlane = weight->size[0];
+ long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
+
+ if (outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%d x %d x %d). "
+ "Calculated output size: (%d x %d x %d). Output size is too small",
+ nInputPlane,inputHeight,inputWidth,nOutputPlane*nInputPlane,outputHeight,outputWidth);
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim + 1, dimf, nInputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim + 1, dimh, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim + 1, dimw, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim + 1, dimw + 1, outputWidth);
+ }
+}
+
+static void THNN_(SpatialDepthWiseConvolution_updateOutput_frame)(
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ long nInputPlane,
+ long inputWidth,
+ long inputHeight,
+ long nOutputPlane,
+ long outputWidth,
+ long outputHeight)
+{
+ long i;
+ THTensor *output2d;
+
+ THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH,
+ nInputPlane, inputWidth, inputHeight,
+ outputWidth, outputHeight);
+
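+ /* finput now holds the unfolded (im2col-style) patches of the single input
+ plane, shaped (kW*kH) x (outputHeight*outputWidth); the convolution then
+ reduces to one matrix product with the (nOutputPlane x kH*kW) weight below. */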
+ output2d = THTensor_(newWithStorage2d)(output->storage, output->storageOffset,
+ nOutputPlane, -1,
+ outputHeight*outputWidth, -1);
+ if (bias) {
+ for(i = 0; i < nOutputPlane; i++)
+ THVector_(fill)
+ (output->storage->data + output->storageOffset + output->stride[0] * i,
+ THTensor_(get1d)(bias, i), outputHeight*outputWidth);
+ } else {
+ THTensor_(zero)(output);
+ }
+
+ THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput);
+
+ THTensor_(free)(output2d);
+}
+
+void THNN_(SpatialDepthWiseConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH)
+{
+ long nInputPlane = weight->nDimension == 2 ? weight->size[1]/(kH*kW) : weight->size[1];
+ long nOutputPlane = weight->size[0];
+ if (weight->nDimension == 2) {
+ THTensor_(resize4d)(weight, nOutputPlane, nInputPlane, kH, kW);
+ }
+
+ THNN_(SpatialDepthWiseConvolution_shapeCheck)
+ (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW);
+
+ THTensor *_weight = THTensor_(newTranspose)(weight, 0, 1);
+ weight = THTensor_(newContiguous)(_weight);
+
+ THTensor *_bias = NULL;
+ if(bias) {
+ _bias = THTensor_(newTranspose)(bias, 0, 1);
+ bias = THTensor_(newContiguous)(_bias);
+ }
+
+ // resize weight
+ long s1 = weight->size[0];
+ long s2 = weight->size[1];
+ long s3 = weight->size[2] * weight->size[3];
+ weight = THTensor_(newWithStorage3d)(weight->storage, weight->storageOffset,
+ s1, -1, s2, -1, s3, -1);
+
+ input = THTensor_(newContiguous)(input);
+
+ int ndim = input->nDimension;
+
+ int batch = 1;
+ if (ndim == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ }
+
+ long inputHeight = input->size[2];
+ long inputWidth = input->size[3];
+ long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
+
+ long T = input->size[0];
+ long t;
+
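+ /* The output is laid out 5D as (batch, nInputPlane, nOutputPlane, oH, oW):
+ every input channel gets its own bank of nOutputPlane filters (a depth
+ multiplier), and the result is flattened to nInputPlane*nOutputPlane
+ channels once the loop below finishes. */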
+ THTensor_(resize5d)(output, T, nInputPlane, nOutputPlane, outputHeight, outputWidth);
+ THTensor_(resize4d)(finput, T, nInputPlane, kW*kH*1, outputHeight*outputWidth);
+
+#pragma omp parallel for private(t)
+ for(t = 0; t < T; t++)
+ {
+ THTensor *input_t = THTensor_(newSelect)(input, 0, t);
+ THTensor *output_t = THTensor_(newSelect)(output, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ long i;
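+ /* Note: this and the similar inner loops in the backward passes are nested
+ parallel regions; OpenMP serializes them by default unless nested
+ parallelism is enabled (e.g. via OMP_NESTED). */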
+#pragma omp parallel for private(i)
+ for(i = 0; i < nInputPlane; i++)
+ {
+ THTensor *weight_i = THTensor_(newSelect)(weight, 0, i);
+ THTensor *input_i = THTensor_(newNarrow)(input_t, 0, i, 1);
+ THTensor *output_i = THTensor_(newSelect)(output_t, 0, i);
+ THTensor *finput_i = THTensor_(newSelect)(finput_t, 0, i);
+ THTensor *bias_i = NULL;
+ if(bias) {
+ bias_i = THTensor_(newSelect)(bias, 0, i);
+ }
+ THNN_(SpatialDepthWiseConvolution_updateOutput_frame)
+ (input_i, output_i, weight_i, bias_i, finput_i,
+ kW, kH, dW, dH, padW, padH,
+ 1, inputWidth, inputHeight,
+ nOutputPlane, outputWidth, outputHeight);
+
+ THTensor_(free)(input_i);
+ THTensor_(free)(weight_i);
+ THTensor_(free)(bias_i);
+ THTensor_(free)(output_i);
+ THTensor_(free)(finput_i);
+ }
+ THTensor_(free)(input_t);
+ THTensor_(free)(output_t);
+ THTensor_(free)(finput_t);
+ }
+
+ THTensor_(free)(weight);
+ THTensor_(free)(_weight);
+ THTensor_(free)(bias);
+ THTensor_(free)(_bias);
+ THTensor_(resize4d)(output, T, nInputPlane * nOutputPlane, outputHeight, outputWidth);
+
+ if (batch == 0) {
+ THTensor_(select)(output, NULL, 0, 0);
+ THTensor_(select)(input, NULL, 0, 0);
+ THTensor_(select)(finput, NULL, 0, 0);
+ }
+ THTensor_(free)(input);
+}
+
+static void THNN_(SpatialDepthWiseConvolution_updateGradInput_frame)(
+ THTensor *gradInput,
+ THTensor *gradOutput,
+ THTensor *weight,
+ THTensor *fgradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH)
+{
+ THTensor *gradOutput2d = THTensor_(newWithStorage2d)
+ (gradOutput->storage, gradOutput->storageOffset,
+ gradOutput->size[0], -1,
+ gradOutput->size[1]*gradOutput->size[2], -1);
+ THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d);
+ THTensor_(free)(gradOutput2d);
+
+ THTensor_(zero)(gradInput);
+
+ THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH,
+ padW, padH,
+ gradInput->size[0], gradInput->size[2], gradInput->size[1],
+ gradOutput->size[2], gradOutput->size[1]);
+}
+
+void THNN_(SpatialDepthWiseConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH)
+{
+ long nInputPlane = weight->nDimension == 2 ? weight->size[1]/(kH*kW) : weight->size[1];
+ long nOutputPlane = weight->size[0];
+ if (weight->nDimension == 2) {
+ THTensor_(resize4d)(weight, nOutputPlane, nInputPlane, kH, kW);
+ }
+ gradOutput = THTensor_(newWithTensor)(gradOutput);
+
+ if (input->nDimension == 3) {
+ if (gradOutput->nDimension == 3) {
+ THTensor_(resize4d)(gradOutput, nInputPlane, nOutputPlane, gradOutput->size[1], gradOutput->size[2]);
+ }
+ }
+ else
+ {
+ if (gradOutput->nDimension == 4) {
+ THTensor_(resize5d)(gradOutput, gradOutput->size[0], nInputPlane, nOutputPlane, gradOutput->size[2], gradOutput->size[3]);
+ }
+ }
+
+ THNN_(SpatialDepthWiseConvolution_shapeCheck)
+ (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW);
+
+ THTensor *_weight = THTensor_(newTranspose)(weight, 0, 1);
+ weight = THTensor_(newContiguous)(_weight);
+
+ // resize weight
+ long s1 = weight->size[0];
+ long s2 = weight->size[1];
+ long s3 = weight->size[2] * weight->size[3];
+ weight = THTensor_(newWithStorage3d)(weight->storage, weight->storageOffset,
+ s1, -1, s2, -1, s3, -1);
+
+ input = THTensor_(newContiguous)(input);
+
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+ }
+
+ long inputHeight = input->size[2];
+ long inputWidth = input->size[3];
+ long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
+
+ long T = input->size[0];
+ long t;
+
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(resize4d)(fgradInput, T, nInputPlane, kW*kH*1, outputHeight*outputWidth);
+
+ // depending on the BLAS library, fgradInput (result tensor) might
+ // be left uninitialized on zero alpha, which might lead to weird behavior
+ // hence, to be safe, zero it
+ THTensor_(zero)(fgradInput);
+
+#pragma omp parallel for private(t)
+ for(t = 0; t < T; t++)
+ {
+ THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
+
+ long i;
+#pragma omp parallel for private(i)
+ for(i = 0; i < nInputPlane; i++)
+ {
+ THTensor *weight_i = THTensor_(newSelect)(weight, 0, i);
+ THTensor *gradInput_i = THTensor_(newNarrow)(gradInput_t, 0, i, 1);
+ THTensor *gradOutput_i = THTensor_(newSelect)(gradOutput_t, 0, i);
+ THTensor *fgradInput_i = THTensor_(newSelect)(fgradInput_t, 0, i);
+
+ THTensor_(transpose)(weight_i, weight_i, 0, 1);
+
+ THNN_(SpatialDepthWiseConvolution_updateGradInput_frame)(gradInput_i, gradOutput_i,
+ weight_i, fgradInput_i,
+ kW, kH, dW, dH, padW, padH);
+
+ THTensor_(free)(gradInput_i);
+ THTensor_(free)(weight_i);
+ THTensor_(free)(gradOutput_i);
+ THTensor_(free)(fgradInput_i);
+ }
+
+ THTensor_(free)(gradInput_t);
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(fgradInput_t);
+ }
+
+ if (batch == 0) {
+ THTensor_(select)(gradOutput, NULL, 0, 0);
+ THTensor_(select)(input, NULL, 0, 0);
+ THTensor_(select)(gradInput, NULL, 0, 0);
+ THTensor_(select)(fgradInput, NULL, 0, 0);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+ THTensor_(free)(_weight);
+}
+
+static void THNN_(SpatialDepthWiseConvolution_accGradParameters_frame)(
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ accreal scale)
+{
+ long i;
+ THTensor *gradOutput2d = THTensor_(newWithStorage2d)
+ (gradOutput->storage, gradOutput->storageOffset,
+ gradOutput->size[0], -1,
+ gradOutput->size[1]*gradOutput->size[2], -1);
+
+ THTensor_(transpose)(finput, finput, 0, 1);
+ THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, finput);
+ THTensor_(transpose)(finput, finput, 0, 1);
+
+ if (gradBias) {
+ for(i = 0; i < gradBias->size[0]; i++)
+ {
+ long k;
+ real sum = 0;
+ real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0];
+ for(k = 0; k < gradOutput2d->size[1]; k++)
+ sum += data[k];
+ (gradBias->storage->data + gradBias->storageOffset)[i] += scale*sum;
+ }
+ }
+
+ THTensor_(free)(gradOutput2d);
+}
+
+void THNN_(SpatialDepthWiseConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ accreal scale)
+{
+ long nInputPlane = gradWeight->nDimension == 2 ? gradWeight->size[1]/(kH*kW) : gradWeight->size[1];
+ long nOutputPlane = gradWeight->size[0];
+ if (gradWeight->nDimension == 2) {
+ THTensor_(resize4d)(gradWeight, nOutputPlane, nInputPlane, kH, kW);
+ }
+
+ gradOutput = THTensor_(newWithTensor)(gradOutput);
+ if (input->nDimension == 3) {
+ if (gradOutput->nDimension == 3) {
+ THTensor_(resize4d)(gradOutput, nInputPlane, nOutputPlane, gradOutput->size[1], gradOutput->size[2]);
+ }
+ }
+ else
+ {
+ if (gradOutput->nDimension == 4) {
+ THTensor_(resize5d)(gradOutput, gradOutput->size[0], nInputPlane, nOutputPlane, gradOutput->size[2], gradOutput->size[3]);
+ }
+ }
+
+ THNN_(SpatialDepthWiseConvolution_shapeCheck)
+ (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW);
+
+ // Transpose gradWeight & gradBias
+ THTensor_(transpose)(gradWeight, NULL, 0, 1);
+ THTensor *_gradWeight;
+ _gradWeight = gradWeight;
+ gradWeight = THTensor_(newContiguous)(gradWeight);
+
+ THTensor *_gradBias = NULL;
+ if(gradBias) {
+ THTensor_(transpose)(gradBias, NULL, 0, 1);
+ _gradBias = gradBias;
+ gradBias = THTensor_(newContiguous)(gradBias);
+ }
+
+ // resize gradWeight
+ long s1 = gradWeight->size[0];
+ long s2 = gradWeight->size[1];
+ long s3 = gradWeight->size[2] * gradWeight->size[3];
+ gradWeight = THTensor_(newWithStorage3d)(gradWeight->storage, gradWeight->storageOffset,
+ s1, -1, s2, -1, s3, -1);
+
+ input = THTensor_(newContiguous)(input);
+
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+ }
+
+ long inputHeight = input->size[2];
+ long inputWidth = input->size[3];
+ long outputHeight = (inputHeight + 2*padH - kH) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - kW) / dW + 1;
+
+ long T = input->size[0];
+ long t;
+ THTensor_(resize4d)(finput, T, nInputPlane, kW*kH*1, outputHeight*outputWidth);
+
+ for(t = 0; t < T; t++)
+ {
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+ long i;
+#pragma omp parallel for private(i)
+ for(i = 0; i < nInputPlane; i++)
+ {
+ THTensor *finput_i = THTensor_(newSelect)(finput_t, 0, i);
+ THTensor *gradOutput_i = THTensor_(newSelect)(gradOutput_t, 0, i);
+ THTensor *gradWeight_i = THTensor_(newSelect)(gradWeight, 0, i);
+ THTensor *gradBias_i = NULL;
+ if(gradBias) {
+ gradBias_i = THTensor_(newSelect)(gradBias, 0, i);
+ }
+ THNN_(SpatialDepthWiseConvolution_accGradParameters_frame)(gradOutput_i, gradWeight_i,
+ gradBias_i, finput_i, scale);
+
+ THTensor_(free)(finput_i);
+ THTensor_(free)(gradOutput_i);
+ THTensor_(free)(gradWeight_i);
+ THTensor_(free)(gradBias_i);
+ }
+
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(finput_t);
+ }
+
+ // Copy back and transpose back
+ THTensor_(transpose)(_gradWeight, NULL, 0, 1);
+ THTensor_(resize4d)(_gradWeight, nInputPlane, nOutputPlane, kH, kW);
+ THTensor_(copy)(_gradWeight, gradWeight);
+ THTensor_(transpose)(_gradWeight, NULL, 0, 1);
+
+ if(gradBias) {
+ THTensor_(transpose)(_gradBias, NULL, 0, 1);
+ THTensor_(resize2d)(_gradBias, nInputPlane, nOutputPlane);
+ THTensor_(copy)(_gradBias, gradBias);
+ THTensor_(transpose)(_gradBias, NULL, 0, 1);
+ }
+
+ if (batch == 0) {
+ THTensor_(select)(gradOutput, NULL, 0, 0);
+ THTensor_(select)(input, NULL, 0, 0);
+ THTensor_(select)(finput, NULL, 0, 0);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(gradWeight);
+ THTensor_(free)(gradBias);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedConvolution.c
new file mode 100644
index 000000000..897cc0da4
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedConvolution.c
@@ -0,0 +1,408 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialDilatedConvolution.c"
+#else
+
+static inline void THNN_(SpatialDilatedConvolution_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int kH, int kW, int dH, int dW, int padH, int padW,
+ int dilationH, int dilationW) {
+
+ THNN_ARGCHECK(weight->nDimension == 4, 4, weight,
+ "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, "
+ "but got: %s");
+ THArgCheck(kW > 0 && kH > 0, 9,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+ THArgCheck(dilationW > 0 && dilationH > 0, 15,
+ "dilation should be greater than zero, but got dilationH: %d, dilationW: %d",
+ dilationH, dilationW);
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]);
+ }
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ long nInputPlane = weight->size[1];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long nOutputPlane = weight->size[0];
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
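+ /* dilationH*(kH-1)+1 is the effective extent of the dilated kernel;
+ otherwise this is the usual floor-division convolution size formula. */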
+
+ if (outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%ld x %ld x %ld). "
+ "Calculated output size: (%ld x %ld x %ld). Output size is too small",
+ nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
+void THNN_(SpatialDilatedConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *columns,
+ THTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH)
+{
+
+ THNN_(SpatialDilatedConvolution_shapeCheck)
+ (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW,
+ dilationH, dilationW);
+
+ // Params:
+ int nInputPlane = weight->size[1];
+ int nOutputPlane = weight->size[0];
+
+ input = THTensor_(newContiguous)(input);
+ weight = THTensor_(newContiguous)(weight);
+ bias = bias ? THTensor_(newContiguous)(bias) : bias;
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ }
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth);
+ THTensor_(zero)(output);
+
+ // Resize temporary columns
+ THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+ // Define a buffer of ones, for bias accumulation
+ // Note: this buffer can be shared with other modules; it only ever grows
+ // and always contains ones.
+ if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THTensor_(resize2d)(ones, outputHeight, outputWidth);
+ THTensor_(fill)(ones, 1);
+ }
+
+ // Helpers
+ THTensor *input_n = THTensor_(new)();
+ THTensor *output_n = THTensor_(new)();
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per output:
+ THTensor_(select)(input_n, input, 0, elt);
+ THTensor_(select)(output_n, output, 0, elt);
+
+ // Do Bias first:
+ // M,N,K are dims of matrix A and B
+ long m_ = nOutputPlane;
+ long n_ = outputHeight * outputWidth;
+ long k_ = 1;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
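+ // With column-major GEMM, ('t','n', n_, m_, 1) computes the rank-1 outer
+ // product of bias (m_ x 1) with ones (1 x n_), i.e. it fills every spatial
+ // location of output plane j with bias[j].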
+ if (bias) {
+ THBlas_(gemm)(
+ 't', 'n',
+ n_, m_, k_,
+ 1,
+ THTensor_(data)(ones), k_,
+ THTensor_(data)(bias), k_,
+ 0,
+ THTensor_(data)(output_n), n_
+ );
+ } else {
+ THTensor_(zero)(output_n);
+ }
+
+ // Extract columns:
+ THNN_(im2col)(
+ THTensor_(data)(input_n),
+ nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+ dilationH, dilationW,
+ THTensor_(data)(columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ long m = nOutputPlane;
+ long n = columns->size[1];
+ long k = nInputPlane*kH*kW;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
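+ // In row-major terms this computes
+ // output_n (nOutputPlane x oH*oW) += weight (m x k) * columns (k x n);
+ // swapping the operand order and passing (n, m, k) makes the row-major
+ // buffers look column-major to BLAS.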
+ THBlas_(gemm)(
+ 'n', 'n',
+ n, m, k,
+ 1,
+ THTensor_(data)(columns), n,
+ THTensor_(data)(weight), k,
+ 1,
+ THTensor_(data)(output_n), n
+ );
+ }
+
+ // Free
+ THTensor_(free)(input_n);
+ THTensor_(free)(output_n);
+
+ // Resize output
+ if (batch == 0) {
+ THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
+ THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(weight);
+ if (bias) THTensor_(free)(bias);
+}
+
+void THNN_(SpatialDilatedConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *gradColumns,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH)
+{
+ THNN_(SpatialDilatedConvolution_shapeCheck)
+ (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW,
+ dilationH, dilationW);
+
+ // Params
+ int nInputPlane = weight->size[1];
+ int nOutputPlane = weight->size[0];
+
+ input = THTensor_(newContiguous)(input);
+ weight = THTensor_(newContiguous)(weight);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1],
+ gradOutput->size[2]);
+ }
+
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
+
+ // Resize temporary columns
+ THTensor_(resize2d)(gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth);
+ THTensor_(zero)(gradColumns);
+
+ // Helpers
+ THTensor *gradInput_n = THTensor_(new)();
+ THTensor *gradOutput_n = THTensor_(new)();
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per sample:
+ THTensor_(select)(gradInput_n, gradInput, 0, elt);
+ THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+ // M,N,K are dims of matrix A and B
+ long m = nInputPlane*kW*kH;
+ long n = gradColumns->size[1];
+ long k = nOutputPlane;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
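+ // Row-major view: gradColumns (nInputPlane*kH*kW x oH*oW) =
+ // weight^T * gradOutput_n; col2im below then scatter-adds the columns
+ // back into gradInput.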
+ THBlas_(gemm)(
+ 'n', 't',
+ n, m, k,
+ 1,
+ THTensor_(data)(gradOutput_n), n,
+ THTensor_(data)(weight), m,
+ 0,
+ THTensor_(data)(gradColumns), n
+ );
+
+ // Unpack columns back into input:
+ THNN_(col2im)(
+ THTensor_(data)(gradColumns),
+ nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+ dilationH, dilationW,
+ THTensor_(data)(gradInput_n)
+ );
+ }
+
+ // Free
+ THTensor_(free)(gradInput_n);
+ THTensor_(free)(gradOutput_n);
+
+ // Resize output
+ if (batch == 0) {
+ THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
+ THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+ THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+}
+
+
+void THNN_(SpatialDilatedConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *columns,
+ THTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THNN_(SpatialDilatedConvolution_shapeCheck)
+ (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW,
+ dilationH, dilationW);
+
+ // Params
+ int nInputPlane = gradWeight->size[1];
+ int nOutputPlane = gradWeight->size[0];
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
+ if (gradBias)
+ THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0],
+ gradOutput->size[1], gradOutput->size[2]);
+ }
+
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Define a buffer of ones, for bias accumulation
+ if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THTensor_(resize2d)(ones, outputHeight, outputWidth);
+ THTensor_(fill)(ones, 1);
+ }
+
+ // Resize temporary columns
+ THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+ // Helpers
+ THTensor *input_n = THTensor_(new)();
+ THTensor *gradOutput_n = THTensor_(new)();
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per output:
+ THTensor_(select)(input_n, input, 0, elt);
+ THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ THNN_(im2col)(
+ THTensor_(data)(input_n),
+ nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+ dilationH, dilationW,
+ THTensor_(data)(columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ long m = nOutputPlane;
+ long n = nInputPlane*kW*kH;
+ long k = columns->size[1];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
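+ // Row-major view: gradWeight (nOutputPlane x nInputPlane*kH*kW) +=
+ // scale * gradOutput_n (nOutputPlane x oH*oW) * columns^T.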
+ THBlas_(gemm)(
+ 't', 'n',
+ n, m, k,
+ scale,
+ THTensor_(data)(columns), k,
+ THTensor_(data)(gradOutput_n), k,
+ 1,
+ THTensor_(data)(gradWeight), n
+ );
+
+ // Do Bias:
+ // M,N,K are dims of matrix A and B
+ long m_ = nOutputPlane;
+ long k_ = outputHeight * outputWidth;
+
+ // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
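+ // gemv('t') multiplies the transposed (m_ x k_) gradOutput view by the
+ // ones vector, so gradBias[j] accumulates scale times the sum of output
+ // plane j over all spatial positions.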
+ if (gradBias) {
+ THBlas_(gemv)(
+ 't',
+ k_, m_,
+ scale,
+ THTensor_(data)(gradOutput_n), k_,
+ THTensor_(data)(ones), 1,
+ 1,
+ THTensor_(data)(gradBias), 1
+ );
+ }
+ }
+
+ // Free
+ THTensor_(free)(input_n);
+ THTensor_(free)(gradOutput_n);
+
+ // Resize
+ if (batch == 0) {
+ THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
+ THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedMaxPooling.c
new file mode 100644
index 000000000..8f4ad13c3
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedMaxPooling.c
@@ -0,0 +1,401 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialDilatedMaxPooling.c"
+#else
+
+static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)(
+ THTensor *input, THTensor *gradOutput, THIndexTensor *indices,
+ int kH, int kW, int dH, int dW, int padH, int padW,
+ int dilationH, int dilationW, bool ceil_mode) {
+
+ THArgCheck(kW > 0 && kH > 0, 5,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 8,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+ THArgCheck(dilationH > 0 && dilationW > 0, 12,
+ "dilation should be greater than zero, but got dilationH: %d dilationW: %d",
+ dilationH, dilationW);
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ THArgCheck(kW/2 >= padW && kH/2 >= padH, 2,
+ "pad should be at most half of kernel size, but got "
+ "padW = %d, padH = %d, kW = %d, kH = %d",
+ padW, padH, kW, kH);
+
+ long nInputPlane = input->size[dimh-1];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long outputHeight, outputWidth;
+ long nOutputPlane = nInputPlane;
+
+ if (ceil_mode)
+ {
+ outputHeight = (long)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
+ outputWidth = (long)(ceil((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
+ }
+ else
+ {
+ outputHeight = (long)(floor((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
+ outputWidth = (long)(floor((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
+ }
+
+ if (padW || padH)
+ {
+ // ensure that the last pooling starts inside the image
+ // needed to avoid problems in ceil mode
+ if ((outputHeight - 1)*dH >= inputHeight + padH)
+ --outputHeight;
+ if ((outputWidth - 1)*dW >= inputWidth + padW)
+ --outputWidth;
+ }
+
+ if (outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%dx%dx%d). "
+ "Calculated output size: (%dx%dx%d). Output size is too small",
+ nInputPlane,inputHeight,inputWidth,nInputPlane,outputHeight,outputWidth);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+ if (indices != NULL) {
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimw, outputWidth);
+ }
+}
+
+static void THNN_(SpatialDilatedMaxPooling_updateOutput_frame)(
+ real *input_p,
+ real *output_p,
+ THIndex_t *ind_p,
+ long nslices,
+ long iwidth,
+ long iheight,
+ long owidth,
+ long oheight,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ int dilationW,
+ int dilationH
+ )
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ /* loop over output */
+ long i, j;
+ real *ip = input_p + k*iwidth*iheight;
+ for(i = 0; i < oheight; i++)
+ {
+ for(j = 0; j < owidth; j++)
+ {
+ long hstart = i * dH - padH;
+ long wstart = j * dW - padW;
+ long hend = fminf(hstart + (kH - 1) * dilationH + 1, iheight);
+ long wend = fminf(wstart + (kW - 1) * dilationW + 1, iwidth);
+ while(hstart < 0)
+ hstart += dilationH;
+ while(wstart < 0)
+ wstart += dilationW;
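+ /* hend/wend clamp the dilated window to the image; the while loops above
+ advance a negative (padded) start to the first in-bounds tap of the
+ dilation grid. */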
+
+ /* local pointers */
+ real *op = output_p + k*owidth*oheight + i*owidth + j;
+ THIndex_t *indp = ind_p + k*owidth*oheight + i*owidth + j;
+
+ /* compute local max: */
+ long maxindex = -1;
+ real maxval = -THInf;
+ long tcntr = 0;
+ long x,y;
+ for(y = hstart; y < hend; y += dilationH)
+ {
+ for(x = wstart; x < wend; x += dilationW)
+ {
+ tcntr = y*iwidth + x;
+ real val = *(ip + tcntr);
+ if (val > maxval)
+ {
+ maxval = val;
+ maxindex = tcntr;
+ }
+ }
+ }
+
+ /* set output to local max */
+ *op = maxval;
+
+ /* store location of max */
+ *indp = maxindex + TH_INDEX_BASE;
+ }
+ }
+ }
+}
+
+void THNN_(SpatialDilatedMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ int dilationW,
+ int dilationH,
+ bool ceil_mode)
+{
+
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+ long nInputPlane;
+ long inputHeight;
+ long inputWidth;
+ long outputHeight;
+ long outputWidth;
+ real *input_data;
+ real *output_data;
+ THIndex_t *indices_data;
+
+ THNN_(SpatialDilatedMaxPooling_shapeCheck)
+ (input, NULL, NULL, kH, kW, dH, dW,
+ padH, padW, dilationH, dilationW, ceil_mode);
+
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nInputPlane = input->size[dimh-1];
+ inputHeight = input->size[dimh];
+ inputWidth = input->size[dimw];
+ if (ceil_mode)
+ {
+ outputHeight = (long)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
+ outputWidth = (long)(ceil((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
+ }
+ else
+ {
+ outputHeight = (long)(floor((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
+ outputWidth = (long)(floor((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1;
+ }
+
+ if (padW || padH)
+ {
+ // ensure that the last pooling starts inside the image
+ // needed to avoid problems in ceil mode
+ if ((outputHeight - 1)*dH >= inputHeight + padH)
+ --outputHeight;
+ if ((outputWidth - 1)*dW >= inputWidth + padW)
+ --outputWidth;
+ }
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ /* resize output */
+ if (input->nDimension == 3)
+ {
+ THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth);
+ /* indices will contain the locations for each output point */
+ THIndexTensor_(resize3d)(indices, nInputPlane, outputHeight, outputWidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ THNN_(SpatialDilatedMaxPooling_updateOutput_frame)
+ (input_data, output_data,
+ indices_data,
+ nInputPlane,
+ inputWidth, inputHeight,
+ outputWidth, outputHeight,
+ kW, kH, dW, dH,
+ padW, padH,
+ dilationW, dilationH
+ );
+ }
+ else
+ {
+ long p;
+
+ THTensor_(resize4d)(output, nbatch, nInputPlane, outputHeight, outputWidth);
+ /* indices will contain the locations for each output point */
+ THIndexTensor_(resize4d)(indices, nbatch, nInputPlane, outputHeight, outputWidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialDilatedMaxPooling_updateOutput_frame)
+ (input_data+p*nInputPlane*inputWidth*inputHeight,
+ output_data+p*nInputPlane*outputWidth*outputHeight,
+ indices_data+p*nInputPlane*outputWidth*outputHeight,
+ nInputPlane,
+ inputWidth, inputHeight,
+ outputWidth, outputHeight,
+ kW, kH, dW, dH,
+ padW, padH,
+ dilationW, dilationH
+ );
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+}
+
+static void THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)(
+ real *gradInput_p,
+ real *gradOutput_p,
+ THIndex_t *ind_p,
+ long nInputPlane,
+ long inputWidth,
+ long inputHeight,
+ long outputWidth,
+ long outputHeight,
+ int dW,
+ int dH)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nInputPlane; k++)
+ {
+ real *gradInput_p_k = gradInput_p + k*inputWidth*inputHeight;
+ real *gradOutput_p_k = gradOutput_p + k*outputWidth*outputHeight;
+ THIndex_t *ind_p_k = ind_p + k*outputWidth*outputHeight;
+
+ /* calculate max points */
+ long i, j;
+ for(i = 0; i < outputHeight; i++)
+ {
+ for(j = 0; j < outputWidth; j++)
+ {
+ /* retrieve position of max */
+ long maxp = ind_p_k[i*outputWidth + j] - TH_INDEX_BASE;
+ if (maxp != -1) {
+ /* update gradient */
+ gradInput_p_k[maxp] += gradOutput_p_k[i*outputWidth + j];
+ }
+ }
+ }
+ }
+}
+
+void THNN_(SpatialDilatedMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ int dilationW,
+ int dilationH,
+ bool ceil_mode)
+{
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+ int nInputPlane;
+ int inputHeight;
+ int inputWidth;
+ int outputHeight;
+ int outputWidth;
+ real *gradInput_data;
+ real *gradOutput_data;
+ THIndex_t *indices_data;
+
+ THNN_(SpatialDilatedMaxPooling_shapeCheck)
+ (input, gradOutput, indices, kH, kW, dH, dW,
+ padH, padW, dilationH, dilationW, ceil_mode);
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ if (input->nDimension == 4) {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nInputPlane = input->size[dimh-1];
+ inputHeight = input->size[dimh];
+ inputWidth = input->size[dimw];
+ outputHeight = gradOutput->size[dimh];
+ outputWidth = gradOutput->size[dimw];
+
+ /* get raw pointers */
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+ indices_data = THIndexTensor_(data)(indices);
+
+ /* backprop */
+ if (input->nDimension == 3)
+ {
+ THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)
+ (gradInput_data, gradOutput_data,
+ indices_data,
+ nInputPlane,
+ inputWidth, inputHeight,
+ outputWidth, outputHeight,
+ dW, dH);
+ }
+ else
+ {
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)
+ (gradInput_data+p*nInputPlane*inputWidth*inputHeight,
+ gradOutput_data+p*nInputPlane*outputWidth*outputHeight,
+ indices_data+p*nInputPlane*outputWidth*outputHeight,
+ nInputPlane,
+ inputWidth, inputHeight,
+ outputWidth, outputHeight,
+ dW, dH);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialFractionalMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialFractionalMaxPooling.c
new file mode 100644
index 000000000..a98954cc6
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialFractionalMaxPooling.c
@@ -0,0 +1,253 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialFractionalMaxPooling.c"
+#else
+
+static long* THNN_(SpatialFractionalMaxPooling_generateIntervals)(
+ real sample,
+ long inputSize,
+ long outputSize,
+ int poolSize) {
+ real alpha = (real) (inputSize - poolSize) / (real) (outputSize - 1);
+ long* sequence = (long*) THAlloc(sizeof(long) * outputSize);
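+ /* Pseudo-random intervals for fractional max pooling: with u = sample in
+ [0,1), start[i] = floor((i+u)*alpha) - floor(u*alpha), so consecutive
+ starts differ by floor(alpha) or ceil(alpha), every poolSize-wide window
+ stays in bounds, and the last window is pinned flush to the end. */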
+
+ long i;
+ for (i = 0; i < outputSize - 1; ++i) {
+ sequence[i] =
+ (long) ((i + sample) * alpha) - (long) (sample * alpha);
+ }
+ sequence[outputSize - 1] = inputSize - poolSize;
+
+ return sequence;
+}
+
+static void THNN_(SpatialFractionalMaxPooling_updateOutput_frame)(
+ real* input,
+ real* output,
+ THIndex_t* indices,
+ real* randomSamples,
+ long numPlanes,
+ long inputW, long inputH,
+ long outputW, long outputH,
+ int poolSizeW, int poolSizeH) {
+ long plane;
+#pragma omp parallel for private(plane)
+ for (plane = 0; plane < numPlanes; ++plane) {
+ /* each plane contains 2 random samples, one for W and one for H */
+ real* randomSamplesForPlane = randomSamples + plane * 2;
+
+ /* Generate interval sequence */
+ long* sequenceW =
+ THNN_(SpatialFractionalMaxPooling_generateIntervals)(
+ randomSamplesForPlane[0], inputW, outputW, poolSizeW);
+ long* sequenceH =
+ THNN_(SpatialFractionalMaxPooling_generateIntervals)(
+ randomSamplesForPlane[1], inputH, outputH, poolSizeH);
+
+ /* loop over output */
+ long h, w;
+
+ real* inputForPlane = input + plane * inputW * inputH;
+ real* outputForPlane = output + plane * outputW * outputH;
+ THIndex_t* indicesForPlane = indices + plane * outputW * outputH;
+
+ for (h = 0; h < outputH; ++h) {
+ long inputHStart = sequenceH[h];
+
+ for (w = 0; w < outputW; ++w) {
+ long inputWStart = sequenceW[w];
+
+ real maxVal = -THInf;
+ long maxIndex = -1;
+
+ long h2, w2;
+ for (h2 = inputHStart; h2 < inputHStart + poolSizeH; ++h2) {
+ for (w2 = inputWStart; w2 < inputWStart + poolSizeW; ++w2) {
+ THAssert(h2 >= 0 && h2 < inputH);
+ THAssert(w2 >= 0 && w2 < inputW);
+
+ long planeIndex = h2 * inputW + w2;
+ real val = inputForPlane[planeIndex];
+ if (val > maxVal) {
+ maxVal = val;
+ maxIndex = planeIndex;
+ }
+ }
+ }
+
+ THAssert(maxVal != -THInf);
+ THAssert(maxIndex != -1);
+
+ outputForPlane[h * outputW + w] = maxVal;
+ /* +1 to lua index */
+ indicesForPlane[h * outputW + w] = maxIndex + TH_INDEX_BASE;
+ }
+ }
+
+ THFree(sequenceW);
+ THFree(sequenceH);
+ }
+}
+
+void THNN_(SpatialFractionalMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int outputW, int outputH,
+ int poolSizeW, int poolSizeH,
+ THIndexTensor *indices,
+ THTensor *randomSamples) {
+
+ long numBatch = 1;
+ int planeDim = 0;
+ int heightDim = 1;
+ int widthDim = 2;
+
+ long numInputDims = THTensor_(nDimension)(input);
+ THNN_ARGCHECK(numInputDims == 3 || numInputDims == 4, 2, input,
+ "3D or 4D (batch mode) tensor expected for input, but got: %s");
+
+ if (numInputDims == 4) {
+ numBatch = THTensor_(size)(input, 0);
+ planeDim++;
+ heightDim++;
+ widthDim++;
+ }
+
+ /* sizes */
+ long numPlanes = THTensor_(size)(input, planeDim);
+ long inputH = THTensor_(size)(input, heightDim);
+ long inputW = THTensor_(size)(input, widthDim);
+
+ THArgCheck(outputH + poolSizeH - 1 < inputH, 7,
+ "poolSizeH (%d) too large relative to input height (%ld)",
+ poolSizeH, inputH);
+ THArgCheck(outputW + poolSizeW - 1 < inputW, 6,
+ "poolSizeW (%d) too large relative to input width (%ld)",
+ poolSizeW, inputW);
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ if (numInputDims == 3) {
+ /* resize output */
+ THTensor_(resize3d)(output, numPlanes, outputH, outputW);
+ /* indices will contain the locations for each output point */
+ THIndexTensor_(resize3d)(indices, numPlanes, outputH, outputW);
+
+ THNN_(SpatialFractionalMaxPooling_updateOutput_frame)(
+ THTensor_(data)(input),
+ THTensor_(data)(output),
+ THIndexTensor_(data)(indices),
+ THTensor_(data)(randomSamples),
+ numPlanes, inputW, inputH, outputW, outputH, poolSizeW, poolSizeH);
+ } else {
+ THTensor_(resize4d)(output, numBatch, numPlanes, outputH, outputW);
+ /* indices will contain the locations for each output point */
+ THIndexTensor_(resize4d)(indices, numBatch, numPlanes, outputH, outputW);
+
+ long batch;
+#pragma omp parallel for private(batch)
+ for (batch = 0; batch < numBatch; ++batch) {
+ THNN_(SpatialFractionalMaxPooling_updateOutput_frame)(
+ THTensor_(data)(input) + batch * numPlanes * inputH * inputW,
+ THTensor_(data)(output) + batch * numPlanes * outputH * outputW,
+ THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW,
+ THTensor_(data)(randomSamples) + batch * numPlanes * 2,
+ numPlanes, inputW, inputH, outputW, outputH, poolSizeW, poolSizeH);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+}
+
+static void THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)(
+ real* gradInput,
+ real* gradOutput,
+ THIndex_t* indices,
+ long numPlanes,
+ long inputW, long inputH,
+ long outputW, long outputH) {
+ long plane;
+#pragma omp parallel for private(plane)
+ for (plane = 0; plane < numPlanes; plane++) {
+ real* gradInputForPlane = gradInput + plane * inputW * inputH;
+ real* gradOutputForPlane = gradOutput + plane * outputW * outputH;
+ THIndex_t* indicesForPlane = indices + plane * outputW * outputH;
+
+ long h, w;
+ for (h = 0; h < outputH; ++h) {
+ for (w = 0; w < outputW; ++w) {
+ long outputIndex = h * outputW + w;
+ long index = indicesForPlane[outputIndex] - TH_INDEX_BASE;
+ THAssert(index >= 0 && index < inputW * inputH);
+
+ gradInputForPlane[index] += gradOutputForPlane[outputIndex];
+ }
+ }
+ }
+}
+
+void THNN_(SpatialFractionalMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int outputW, int outputH,
+ int poolSizeW, int poolSizeH,
+ THIndexTensor *indices) {
+
+ long numBatch = 1;
+ int planeDim = 0;
+ int heightDim = 1;
+ int widthDim = 2;
+
+ long numInputDims = THTensor_(nDimension)(input);
+ if (numInputDims == 4) {
+ numBatch = THTensor_(size)(input, 0);
+ planeDim = 1;
+ heightDim++;
+ widthDim++;
+ }
+
+ /* sizes */
+ long numPlanes = THTensor_(size)(input, planeDim);
+ long inputH = THTensor_(size)(input, heightDim);
+ long inputW = THTensor_(size)(input, widthDim);
+
+ THArgCheck(outputW == THTensor_(size)(gradOutput, widthDim), 3,
+ "gradOutput width unexpected");
+ THArgCheck(outputH == THTensor_(size)(gradOutput, heightDim), 3,
+ "gradOutput height unexpected");
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ /* backprop */
+ if (numInputDims == 3) {
+ THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)(
+ THTensor_(data)(gradInput),
+ THTensor_(data)(gradOutput),
+ THIndexTensor_(data)(indices),
+ numPlanes, inputW, inputH, outputW, outputH);
+ } else {
+ long batch;
+#pragma omp parallel for private(batch)
+ for (batch = 0; batch < numBatch; ++batch) {
+ THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)(
+ THTensor_(data)(gradInput) + batch * numPlanes * inputH * inputW,
+ THTensor_(data)(gradOutput) + batch * numPlanes * outputH * outputW,
+ THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW,
+ numPlanes, inputW, inputH, outputW, outputH);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolution.c
new file mode 100644
index 000000000..2edc53b5a
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolution.c
@@ -0,0 +1,462 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialFullConvolution.c"
+#else
+
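+/* im2col lays the (channels x kernel_h x kernel_w) patch under every output
+ location out as one column: data_col has channels*kernel_h*kernel_w rows
+ (one per (channel, kernel offset) pair) and height_col*width_col columns
+ (one per output location); out-of-bounds padded taps are written as zeros. */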
+static void THNN_(im2col)(const real* data_im, const int channels,
+ const int height, const int width, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w,
+ const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ real* data_col) {
+ const int height_col = (height + 2 * pad_h -
+ (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+ const int width_col = (width + 2 * pad_w -
+ (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+ const int channels_col = channels * kernel_h * kernel_w;
+ for (int c_col = 0; c_col < channels_col; ++c_col) {
+ int w_offset = c_col % kernel_w;
+ int h_offset = (c_col / kernel_w) % kernel_h;
+ int c_im = c_col / kernel_h / kernel_w;
+ for (int h_col = 0; h_col < height_col; ++h_col) {
+ for (int w_col = 0; w_col < width_col; ++w_col) {
+ int h_im = h_col * stride_h - pad_h + h_offset * dilation_h;
+ int w_im = w_col * stride_w - pad_w + w_offset * dilation_w;
+ data_col[(c_col * height_col + h_col) * width_col + w_col] =
+ (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) ?
+ data_im[(c_im * height + h_im) * width + w_im] : 0;
+ }
+ }
+ }
+}
+
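+/* col2im is the adjoint of im2col: after zeroing the image it scatter-adds
+ every column entry back to its source pixel, so overlapping patches
+ accumulate rather than overwrite. */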
+static void THNN_(col2im)(const real* data_col, const int channels,
+ const int height, const int width, const int kernel_h, const int kernel_w,
+ const int pad_h, const int pad_w,
+ const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ real* data_im) {
+ memset(data_im, 0, sizeof(real) * height * width * channels);
+ const int height_col = (height + 2 * pad_h -
+ (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+ const int width_col = (width + 2 * pad_w -
+ (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+ const int channels_col = channels * kernel_h * kernel_w;
+ for (int c_col = 0; c_col < channels_col; ++c_col) {
+ int w_offset = c_col % kernel_w;
+ int h_offset = (c_col / kernel_w) % kernel_h;
+ int c_im = c_col / kernel_h / kernel_w;
+ for (int h_col = 0; h_col < height_col; ++h_col) {
+ for (int w_col = 0; w_col < width_col; ++w_col) {
+ int h_im = h_col * stride_h - pad_h + h_offset * dilation_h;
+ int w_im = w_col * stride_w - pad_w + w_offset * dilation_w;
+ if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width)
+ data_im[(c_im * height + h_im) * width + w_im] +=
+ data_col[(c_col * height_col + h_col) * width_col + w_col];
+ }
+ }
+ }
+}
+
+static inline void THNN_(SpatialFullConvolution_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int kH, int kW, int dH, int dW, int padH, int padW, int adjH, int adjW) {
+
+ THArgCheck(kW > 0 && kH > 0, 9,
+ "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+ THArgCheck(dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+ THArgCheck(adjW < dW && adjH < dH, 15,
+ "output adjustment must be smaller than stride, but got adjH: %d adjW: %d dH: %d dW: %d",
+ adjH, adjW, dH, dW);
+ THNN_ARGCHECK(weight->nDimension == 2 || weight->nDimension == 4, 5, weight,
+ "2D or 4D weight tensor expected, but got: %s");
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[1]);
+ }
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimh = 1;
+ int dimw = 2;
+
+ if (ndim == 4) {
+ dimf++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+
+ long nInputPlane = weight->size[0];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long nOutputPlane = weight->size[1];
+ long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+ long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
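+ /* Inverse of the forward size formula out = (in + 2*pad - k)/stride + 1:
+ several input sizes map to the same forward output because of the floor
+ division, and adjH/adjW (checked above to be smaller than the stride)
+ select among them. */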
+
+ if (outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%d x %d x %d). "
+ "Calculated output size: (%d x %d x %d). Output size is too small",
+ nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth);
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
+void THNN_(SpatialFullConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *columns,
+ THTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int adjW, int adjH)
+{
+ THNN_(SpatialFullConvolution_shapeCheck)
+ (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, adjH, adjW);
+
+ int nInputPlane = THTensor_(size)(weight,0);
+ int nOutputPlane = THTensor_(size)(weight,1);
+
+ input = THTensor_(newContiguous)(input);
+ weight = THTensor_(newContiguous)(weight);
+ bias = bias ? THTensor_(newContiguous)(bias) : bias;
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ }
+
+ long inputHeight = input->size[2];
+ long inputWidth = input->size[3];
+ long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+ long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth);
+
+ // Resize temporary columns
+ THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth);
+ THTensor_(zero)(columns);
+
+ // Define a buffer of ones, for bias accumulation
+ // Note: this buffer can be shared with other modules; it only ever grows
+ // and always contains ones.
+ if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THTensor_(resize2d)(ones, outputHeight, outputWidth);
+ THTensor_(fill)(ones, 1);
+ }
+
+ // Helpers
+ THTensor *input_n = THTensor_(new)();
+ THTensor *output_n = THTensor_(new)();
+
+ int elt;
+ // For each elt in batch, do:
+ for (elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per output:
+ THTensor_(select)(input_n, input, 0, elt);
+ THTensor_(select)(output_n, output, 0, elt);
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m = weight->size[1] * weight->size[2] * weight->size[3];
+ long n = columns->size[1];
+ long k = weight->size[0];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
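+ // Row-major view: columns (nOutputPlane*kH*kW x iH*iW) = weight^T * input_n;
+ // col2im then accumulates these columns into the larger output image,
+ // which is exactly transposed (full) convolution.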
+ THBlas_(gemm)(
+ 'n', 't',
+ n, m, k,
+ 1,
+ THTensor_(data)(input_n), n,
+ THTensor_(data)(weight), m,
+ 0,
+ THTensor_(data)(columns), n
+ );
+
+ // Unpack columns back into input:
+ THNN_(col2im)(
+ THTensor_(data)(columns),
+ nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
+ 1, 1,
+ THTensor_(data)(output_n)
+ );
+
+ // Do Bias after:
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m_ = nOutputPlane;
+ long n_ = outputHeight * outputWidth;
+ long k_ = 1;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ if (bias) {
+ THBlas_(gemm)(
+ 't', 'n',
+ n_, m_, k_,
+ 1,
+ THTensor_(data)(ones), k_,
+ THTensor_(data)(bias), k_,
+ 1,
+ THTensor_(data)(output_n), n_
+ );
+ }
+ }
+
+ // Free
+ THTensor_(free)(input_n);
+ THTensor_(free)(output_n);
+
+ // Resize output
+ if (batch == 0) {
+ THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
+ THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(weight);
+ if (bias) THTensor_(free)(bias);
+}
+
+void THNN_(SpatialFullConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *gradColumns,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int adjW, int adjH)
+{
+ THNN_(SpatialFullConvolution_shapeCheck)
+ (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, adjH, adjW);
+
+ int nInputPlane = THTensor_(size)(weight,0);
+ int nOutputPlane = THTensor_(size)(weight,1);
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ weight = THTensor_(newContiguous)(weight);
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+ }
+
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+ long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
+ THTensor_(zero)(gradInput);
+
+ // Resize temporary columns
+ THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH, inputHeight*inputWidth);
+
+ // Helpers
+ THTensor *gradInput_n = THTensor_(new)();
+ THTensor *gradOutput_n = THTensor_(new)();
+
+ int elt;
+ // For each elt in batch, do:
+ for (elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per sample:
+ THTensor_(select)(gradInput_n, gradInput, 0, elt);
+ THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ THNN_(im2col)(
+ THTensor_(data)(gradOutput_n),
+ nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
+ 1, 1,
+ THTensor_(data)(gradColumns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m = weight->size[0];
+ long n = gradColumns->size[1];
+ long k = weight->size[1] * weight->size[2] * weight->size[3];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ THBlas_(gemm)(
+ 'n', 'n',
+ n, m, k,
+ 1,
+ THTensor_(data)(gradColumns), n,
+ THTensor_(data)(weight), k,
+ 0,
+ THTensor_(data)(gradInput_n), n
+ );
+ }
+
+ // Free
+ THTensor_(free)(gradInput_n);
+ THTensor_(free)(gradOutput_n);
+
+  // Resize back to 3D if the input was non-batched
+ if (batch == 0) {
+ THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
+ THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+ THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+}
+
+
+void THNN_(SpatialFullConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *columns,
+ THTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int adjW, int adjH,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THNN_(SpatialFullConvolution_shapeCheck)
+ (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, adjH, adjW);
+
+ int nInputPlane = THTensor_(size)(gradWeight,0);
+ int nOutputPlane = THTensor_(size)(gradWeight,1);
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
+ if (gradBias)
+ THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
+ int batch = 1;
+ if (input->nDimension == 3) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+ THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+ }
+
+ long inputWidth = input->size[3];
+ long inputHeight = input->size[2];
+ long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+ long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+
+  // Batch size
+ long batchSize = input->size[0];
+
+ // Define a buffer of ones, for bias accumulation
+ if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THTensor_(resize2d)(ones, outputHeight, outputWidth);
+ THTensor_(fill)(ones, 1);
+ }
+
+ // Resize temporary columns
+ THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth);
+
+ // Helpers
+ THTensor *input_n = THTensor_(new)();
+ THTensor *gradOutput_n = THTensor_(new)();
+
+ int elt;
+ // For each elt in batch, do:
+ for (elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per output:
+ THTensor_(select)(input_n, input, 0, elt);
+ THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ THNN_(im2col)(
+ THTensor_(data)(gradOutput_n),
+ nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
+ 1, 1,
+ THTensor_(data)(columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long n = columns->size[0]; // nOutputPlane * kh * kw
+ long m = input_n->size[0]; // nInputPlane
+ long k = columns->size[1]; // inputHeight * inputWidth
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ THBlas_(gemm)(
+ 't', 'n',
+ n, m, k,
+ scale,
+ THTensor_(data)(columns), k,
+ THTensor_(data)(input_n), k,
+ 1,
+ THTensor_(data)(gradWeight), n
+ );
+
+ // Do Bias:
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ long m_ = nOutputPlane;
+ long k_ = outputHeight * outputWidth;
+
+ // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
+ if (gradBias) {
+ THBlas_(gemv)(
+ 't',
+ k_, m_,
+ scale,
+ THTensor_(data)(gradOutput_n), k_,
+ THTensor_(data)(ones), 1,
+ 1,
+ THTensor_(data)(gradBias), 1
+ );
+ }
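+
+    /* The gemv above is just a per-plane spatial reduction; a plain-loop
+       sketch of the same update (p and s are illustrative indices, not an
+       alternative code path):
+         for (p = 0; p < nOutputPlane; p++)
+           for (s = 0; s < outputHeight*outputWidth; s++)
+             gradBias[p] += scale * gradOutput_n[p*k_ + s];           */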
+ }
+
+ // Free
+ THTensor_(free)(input_n);
+ THTensor_(free)(gradOutput_n);
+
+ // Resize
+ if (batch == 0) {
+ THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
+ THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolutionMap.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolutionMap.c
new file mode 100644
index 000000000..6952fbe25
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolutionMap.c
@@ -0,0 +1,222 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialFullConvolutionMap.c"
+#else
+
+void THNN_(SpatialFullConvolutionMap_updateOutput)(
+ THNNState *state, THTensor *input, THTensor *output_, THTensor *weight, THTensor *bias,
+ THTensor *connTable, int nInputPlane, int nOutputPlane,
+ int dW, int dH)
+{
+ THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+ THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous");
+ THArgCheck(
+ weight != NULL && weight->nDimension == 3
+ && connTable != NULL && connTable->size[0] == weight->size[0], 4,
+ "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
+ );
+
+ const int kH = (int)weight->size[1];
+ const int kW = (int)weight->size[2];
+
+ THArgCheck(input != NULL && input->nDimension == 3, 2, "3D tensor expected");
+ THArgCheck(input->size[0] >= nInputPlane, 2, "invalid number of input planes");
+
+ THTensor_(resize3d)(
+ output_, nOutputPlane,
+ (input->size[1] - 1) * dH + kH,
+ (input->size[2] - 1) * dW + kW
+ );
+
+ /* contiguous */
+ input = THTensor_(newContiguous)(input);
+ THTensor* output = THTensor_(newContiguous)(output_);
+
+ /* get raw pointers */
+ real *input_data = THTensor_(data)(input);
+ real *output_data = THTensor_(data)(output);
+ real *weight_data = THTensor_(data)(weight);
+ real *bias_data = THTensor_(data)(bias);
+ real *connTable_data = THTensor_(data)(connTable);
+
+ /* and dims */
+ const long input_h = input->size[1];
+ const long input_w = input->size[2];
+ const long output_h = output->size[1];
+ const long output_w = output->size[2];
+ const long weight_h = weight->size[1];
+ const long weight_w = weight->size[2];
+
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nOutputPlane; p++)
+ {
+ /* add bias */
+ real *ptr_output = output_data + p*output_w*output_h;
+ long j;
+ int nweight;
+ long k;
+
+ for (j = 0; j < output_h*output_w; j++)
+ ptr_output[j] = bias_data[p];
+
+ /* convolve all maps */
+ nweight = connTable->size[0];
+ for (k = 0; k < nweight; k++)
+ {
+ /* get offsets for input/output */
+ int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE;
+ int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE;
+
+ if (o == p)
+ {
+ THTensor_(fullConv2Dptr)(
+ output_data + o*output_w*output_h,
+ 1.0,
+ input_data + i*input_w*input_h, input_h, input_w,
+ weight_data + k*weight_w*weight_h, weight_h, weight_w,
+ dH, dW
+ );
+ }
+ }
+ }
+
+ /* clean up */
+ THTensor_(free)(input);
+ THTensor_(freeCopyTo)(output, output_);
+}
+
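+/* connTable layout used throughout this file: row k holds the 1-based pair
+   { inputPlane, outputPlane } for kernel k, so connTable_data[k*2+0] - TH_INDEX_BASE
+   is the source plane index and connTable_data[k*2+1] - TH_INDEX_BASE the
+   destination plane index. */
+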
+void THNN_(SpatialFullConvolutionMap_updateGradInput)(
+ THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput_, THTensor *weight, THTensor *bias,
+ THTensor *connTable, int nInputPlane, int nOutputPlane,
+ int dW, int dH)
+{
+ THArgCheck(
+ weight != NULL && weight->nDimension == 3
+ && connTable != NULL && connTable->size[0] == weight->size[0], 5,
+ "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
+ );
+
+ /* contiguous */
+ THTensor* gradInput = THTensor_(newContiguous)(gradInput_);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* Resize/Zero */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ /* get raw pointers */
+ real *gradInput_data = THTensor_(data)(gradInput);
+ real *gradOutput_data = THTensor_(data)(gradOutput);
+ real *weight_data = THTensor_(data)(weight);
+ real *connTable_data = THTensor_(data)(connTable);
+
+ /* and dims */
+ const long input_h = input->size[1];
+ const long input_w = input->size[2];
+ const long output_h = gradOutput->size[1];
+ const long output_w = gradOutput->size[2];
+ const long kH = weight->size[1];
+ const long kW = weight->size[2];
+
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nInputPlane; p++)
+ {
+ long k;
+ /* backward all */
+ int nkernel = connTable->size[0];
+ for (k = 0; k < nkernel; k++)
+ {
+ int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE;
+ int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE;
+ if (i == p)
+ {
+ /* gradient to input */
+ THTensor_(validXCorr2Dptr)(
+ gradInput_data + i*input_w*input_h,
+ 1.0,
+ gradOutput_data + o*output_w*output_h, output_h, output_w,
+ weight_data + k*kW*kH, kH, kW,
+ dH, dW
+ );
+ }
+ }
+ }
+
+ /* clean up */
+ THTensor_(freeCopyTo)(gradInput, gradInput_);
+ THTensor_(free)(gradOutput);
+}
+
+void THNN_(SpatialFullConvolutionMap_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *connTable,
+ int nInputPlane,
+ int nOutputPlane,
+ int dW, int dH,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THArgCheck(
+ gradWeight != NULL && gradWeight->nDimension == 3
+ && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5,
+ "3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
+ );
+
+ /* contiguous */
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* get raw pointers */
+ real *input_data = THTensor_(data)(input);
+ real *gradOutput_data = THTensor_(data)(gradOutput);
+ real *gradWeight_data = THTensor_(data)(gradWeight);
+ real *gradBias_data = THTensor_(data)(gradBias);
+
+ /* and dims */
+ const long input_h = input->size[1];
+ const long input_w = input->size[2];
+ const long output_h = gradOutput->size[1];
+ const long output_w = gradOutput->size[2];
+ const long weight_h = gradWeight->size[1];
+ const long weight_w = gradWeight->size[2];
+
+ /* gradients wrt bias */
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nOutputPlane; k++)
+ {
+ real *ptr_gradOutput = gradOutput_data + k*output_w*output_h;
+ long l;
+ for (l = 0; l < output_h*output_w; l++)
+ gradBias_data[k] += scale*ptr_gradOutput[l];
+ }
+
+ /* gradients wrt weight */
+ int nkernel = connTable->size[0];
+#pragma omp parallel for private(k)
+ for (k = 0; k < nkernel; k++)
+ {
+ int o = (int)THTensor_(get2d)(connTable,k,1) - TH_INDEX_BASE;
+ int i = (int)THTensor_(get2d)(connTable,k,0) - TH_INDEX_BASE;
+
+ /* gradient to kernel */
+ THTensor_(validXCorr2DRevptr)(
+ gradWeight_data + k*weight_w*weight_h,
+ scale,
+ gradOutput_data + o*output_w*output_h, output_h, output_w,
+ input_data + i*input_w*input_h, input_h, input_w,
+ dH, dW
+ );
+ }
+
+ /* clean up */
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxPooling.c
new file mode 100644
index 000000000..88aaa40e1
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxPooling.c
@@ -0,0 +1,44 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialMaxPooling.c"
+#else
+
+void THNN_(SpatialMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ bool ceil_mode)
+{
+ THNN_(SpatialDilatedMaxPooling_updateOutput)(
+ state, input, output, indices,
+ kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode
+ );
+}
+
+void THNN_(SpatialMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ bool ceil_mode)
+{
+ THNN_(SpatialDilatedMaxPooling_updateGradInput)(
+ state, input, gradOutput, gradInput, indices,
+ kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode
+ );
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxUnpooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxUnpooling.c
new file mode 100644
index 000000000..320538686
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxUnpooling.c
@@ -0,0 +1,234 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialMaxUnpooling.c"
+#else
+
+static void THNN_(SpatialMaxUnpooling_updateOutput_frame)(real *input_p, real *output_p,
+ THIndex_t *ind_p,
+ int nslices,
+ int iwidth, int iheight,
+ int owidth, int oheight)
+{
+ int k;
+ int has_error = 0;
+ THIndex_t error_index;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ real *output_p_k = output_p + k*owidth*oheight;
+ real *input_p_k = input_p + k*iwidth*iheight;
+ THIndex_t *ind_p_k = ind_p + k*iwidth*iheight;
+
+ int i, j;
+ THIndex_t maxp;
+ for(i = 0; i < iheight; i++)
+ {
+ for(j = 0; j < iwidth; j++)
+ {
+ maxp = ind_p_k[i*iwidth + j] - TH_INDEX_BASE; /* retrieve position of max */
+ if(maxp<0 || maxp>=owidth*oheight){
+#pragma omp critical
+ {
+ has_error = 1;
+ error_index = maxp;
+ }
+ } else {
+ output_p_k[maxp] = input_p_k[i*iwidth + j]; /* update output */
+ }
+ }
+ }
+ }
+ if (has_error) {
+ THError("found an invalid max index %ld (output volumes are of size %dx%d)",
+ error_index, oheight, owidth);
+ }
+}
+
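+/* Index semantics assumed above: for every input element, ind_p stores the
+   flat (TH_INDEX_BASE-based) offset of its argmax inside the corresponding
+   owidth*oheight output plane, as produced by the matching max-pooling
+   forward pass; anything outside [0, owidth*oheight) is reported as an error. */
+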
+void THNN_(SpatialMaxUnpooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int owidth, int oheight)
+{
+ int dimw = 2;
+ int dimh = 1;
+ int nbatch = 1;
+ int nslices;
+ int iheight;
+ int iwidth;
+ real *input_data;
+ real *output_data;
+ THIndex_t *indices_data;
+
+
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D (batch mode) tensor expected for input, but got: %s");
+ THNN_CHECK_SHAPE_INDICES(input, indices);
+
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimh-1];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+
+ /* get contiguous input and indices */
+ input = THTensor_(newContiguous)(input);
+ indices = THIndexTensor_(newContiguous)(indices);
+
+ /* resize output */
+ if (input->nDimension == 3)
+ {
+ THTensor_(resize3d)(output, nslices, oheight, owidth);
+ THTensor_(zero)(output);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ THNN_(SpatialMaxUnpooling_updateOutput_frame)(input_data, output_data,
+ indices_data,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight);
+ }
+ else
+ {
+ int p;
+
+ THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
+ THTensor_(zero)(output);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialMaxUnpooling_updateOutput_frame)(
+ input_data+p*nslices*iwidth*iheight,
+ output_data+p*nslices*owidth*oheight,
+ indices_data+p*nslices*iwidth*iheight,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+ THIndexTensor_(free)(indices);
+}
+
+static void THNN_(SpatialMaxUnpooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p,
+ THIndex_t *ind_p,
+ int nslices,
+ int iwidth, int iheight,
+ int owidth, int oheight)
+{
+ int k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ real *gradInput_p_k = gradInput_p + k*iwidth*iheight;
+ real *gradOutput_p_k = gradOutput_p + k*owidth*oheight;
+ THIndex_t *ind_p_k = ind_p + k*iwidth*iheight;
+
+ int i, j;
+ THIndex_t maxp;
+ for(i = 0; i < iheight; i++)
+ {
+ for(j = 0; j < iwidth; j++)
+ {
+ maxp = ind_p_k[i*iwidth + j] - TH_INDEX_BASE; /* retrieve position of max */
+ if(maxp < 0 || maxp >= owidth * oheight) {
+ THError("invalid max index %ld, owidth= %d, oheight= %d", maxp, owidth, oheight);
+ }
+ gradInput_p_k[i*iwidth + j] = gradOutput_p_k[maxp]; /* update gradient */
+ }
+ }
+ }
+}
+
+void THNN_(SpatialMaxUnpooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int owidth, int oheight)
+{
+ int dimw = 2;
+ int dimh = 1;
+ int nbatch = 1;
+ int nslices;
+ int iheight;
+ int iwidth;
+ real *gradInput_data;
+ real *gradOutput_data;
+ THIndex_t *indices_data;
+
+ THNN_CHECK_SHAPE_INDICES(input, indices);
+
+ /* get contiguous gradOutput and indices */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ indices = THIndexTensor_(newContiguous)(indices);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ if (input->nDimension == 4) {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimh-1];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+
+ if(owidth!=gradOutput->size[dimw] || oheight!=gradOutput->size[dimh]){
+ THError("Inconsistent gradOutput size. oheight= %d, owidth= %d, gradOutput: %dx%d",
+ oheight, owidth, gradOutput->size[dimh], gradOutput->size[dimw]);
+ }
+
+ /* get raw pointers */
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+ indices_data = THIndexTensor_(data)(indices);
+
+ /* backprop */
+ if (input->nDimension == 3)
+ {
+ THNN_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data, gradOutput_data,
+ indices_data,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight);
+ }
+ else
+ {
+ int p;
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight,
+ indices_data+p*nslices*iwidth*iheight,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+ THIndexTensor_(free)(indices);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialReflectionPadding.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialReflectionPadding.c
new file mode 100644
index 000000000..dcde660ea
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialReflectionPadding.c
@@ -0,0 +1,260 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialReflectionPadding.c"
+#else
+
+static void THNN_(SpatialReflectionPadding_updateOutput_frame)(
+ real *input_p, real *output_p,
+ long nslices,
+ long iwidth, long iheight,
+ long owidth, long oheight,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b)
+{
+ int iStartX = fmax(0, -pad_l);
+ int iStartY = fmax(0, -pad_t);
+ int oStartX = fmax(0, pad_l);
+ int oStartY = fmax(0, pad_t);
+
+ long k, ip_x, ip_y;
+#pragma omp parallel for private(k, ip_x, ip_y)
+ for (k = 0; k < nslices; k++)
+ {
+ long i, j;
+ for (i = 0; i < oheight; i++) {
+ for (j = 0; j < owidth; j++) {
+ if (j < pad_l) {
+ ip_x = pad_l * 2 - j;
+ } else if (j >= pad_l && j < iwidth + pad_l) {
+ ip_x = j;
+ } else {
+ ip_x = (iwidth + pad_l - 1) * 2 - j;
+ }
+ ip_x = ip_x - oStartX + iStartX;
+
+ if (i < pad_t) {
+ ip_y = pad_t * 2 - i;
+ } else if (i >= pad_t && i < iheight + pad_t) {
+ ip_y = i;
+ } else {
+ ip_y = (iheight + pad_t - 1) * 2 - i;
+ }
+ ip_y = ip_y - oStartY + iStartY;
+
+ real *dest_p = output_p + k*owidth*oheight + i * owidth + j;
+ real *src_p = input_p + k*iwidth*iheight + ip_y * iwidth + ip_x;
+ *dest_p = *src_p;
+ }
+ }
+ }
+}
+
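+/* A worked example of the mirror mapping above (hypothetical sizes): with
+   iwidth = 5 and pad_l = 2, output column j first maps to
+     j < 2      -> ip_x = 2*pad_l - j                 (j=0 -> 4, j=1 -> 3)
+     2 <= j < 7 -> ip_x = j
+     j >= 7     -> ip_x = (iwidth + pad_l - 1)*2 - j  (j=7 -> 5, j=8 -> 4)
+   and the final -oStartX + iStartX shift re-bases these into input
+   coordinates, giving the reflected row [2,1,0,1,2,3,4,3,2]. */
+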
+void THNN_(SpatialReflectionPadding_updateOutput)(THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b)
+{
+ int dimw = 2;
+ int dimh = 1;
+ int dimslices = 0;
+ long nbatch = 1;
+ long nslices;
+ long iheight;
+ long iwidth;
+ long oheight;
+ long owidth;
+ real *input_data;
+ real *output_data;
+
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D (batch mode) tensor expected for input, but got: %s");
+
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ dimslices++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimslices];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ oheight = iheight + pad_t + pad_b;
+ owidth = iwidth + pad_l + pad_r;
+
+  THArgCheck(owidth >= 1 || oheight >= 1, 2,
+             "input (H: %d, W: %d) is too small."
+ " Calculated output H: %d W: %d",
+ iheight, iwidth, oheight, owidth);
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ /* resize output */
+ if (input->nDimension == 3)
+ {
+ THTensor_(resize3d)(output, nslices, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+ THNN_(SpatialReflectionPadding_updateOutput_frame)(input_data, output_data,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ pad_l, pad_r,
+ pad_t, pad_b);
+ }
+ else
+ {
+ long p;
+
+ THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialReflectionPadding_updateOutput_frame)(
+ input_data+p*nslices*iwidth*iheight,
+ output_data+p*nslices*owidth*oheight,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ pad_l, pad_r,
+ pad_t, pad_b);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+}
+
+static void THNN_(SpatialReflectionPadding_updateGradInput_frame)(
+ real *ginput_p, real *goutput_p,
+ long nslices,
+ long iwidth, long iheight,
+ long owidth, long oheight,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b)
+{
+ int iStartX = fmax(0, -pad_l);
+ int iStartY = fmax(0, -pad_t);
+ int oStartX = fmax(0, pad_l);
+ int oStartY = fmax(0, pad_t);
+
+ long k, ip_x, ip_y;
+#pragma omp parallel for private(k, ip_x, ip_y)
+ for (k = 0; k < nslices; k++)
+ {
+ long i, j;
+ for (i = 0; i < oheight; i++) {
+ for (j = 0; j < owidth; j++) {
+ if (j < pad_l) {
+ ip_x = pad_l * 2 - j;
+ } else if (j >= pad_l && j < iwidth + pad_l) {
+ ip_x = j;
+ } else {
+ ip_x = (iwidth + pad_l - 1) * 2 - j;
+ }
+ ip_x = ip_x - oStartX + iStartX;
+
+ if (i < pad_t) {
+ ip_y = pad_t * 2 - i;
+ } else if (i >= pad_t && i < iheight + pad_t) {
+ ip_y = i;
+ } else {
+ ip_y = (iheight + pad_t - 1) * 2 - i;
+ }
+ ip_y = ip_y - oStartY + iStartY;
+
+ real *src_p = goutput_p + k*owidth*oheight + i * owidth + j;
+ real *dest_p = ginput_p + k*iwidth*iheight + ip_y * iwidth + ip_x;
+ *dest_p += *src_p;
+ }
+ }
+ }
+}
+
+void THNN_(SpatialReflectionPadding_updateGradInput)(THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b)
+{
+ int dimw = 2;
+ int dimh = 1;
+ int dimslices = 0;
+ long nbatch = 1;
+ long nslices;
+ long iheight;
+ long iwidth;
+ long oheight;
+ long owidth;
+
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ dimslices++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimslices];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ oheight = iheight + pad_t + pad_b;
+ owidth = iwidth + pad_l + pad_r;
+
+ THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3,
+ "gradOutput width unexpected. Expected: %d, Got: %d",
+ owidth, THTensor_(size)(gradOutput, dimw));
+ THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3,
+ "gradOutput height unexpected. Expected: %d, Got: %d",
+ oheight, THTensor_(size)(gradOutput, dimh));
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ /* backprop */
+ if (input->nDimension == 3) {
+ THNN_(SpatialReflectionPadding_updateGradInput_frame)(
+ THTensor_(data)(gradInput),
+ THTensor_(data)(gradOutput),
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ pad_l, pad_r,
+ pad_t, pad_b);
+ } else {
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++) {
+ THNN_(SpatialReflectionPadding_updateGradInput_frame)(
+ THTensor_(data)(gradInput) + p * nslices * iheight * iwidth,
+ THTensor_(data)(gradOutput) + p * nslices * oheight * owidth,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ pad_l, pad_r,
+ pad_t, pad_b);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialReplicationPadding.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialReplicationPadding.c
new file mode 100644
index 000000000..4e318aa70
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialReplicationPadding.c
@@ -0,0 +1,260 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialReplicationPadding.c"
+#else
+
+static void THNN_(SpatialReplicationPadding_updateOutput_frame)(
+ real *input_p, real *output_p,
+ long nslices,
+ long iwidth, long iheight,
+ long owidth, long oheight,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b)
+{
+ int iStartX = fmax(0, -pad_l);
+ int iStartY = fmax(0, -pad_t);
+ int oStartX = fmax(0, pad_l);
+ int oStartY = fmax(0, pad_t);
+
+ long k, ip_x, ip_y;
+#pragma omp parallel for private(k, ip_x, ip_y)
+ for (k = 0; k < nslices; k++)
+ {
+ long i, j;
+ for (i = 0; i < oheight; i++) {
+ for (j = 0; j < owidth; j++) {
+ if (j < pad_l) {
+ ip_x = pad_l;
+ } else if (j >= pad_l && j < iwidth + pad_l) {
+ ip_x = j;
+ } else {
+ ip_x = iwidth + pad_l - 1;
+ }
+ ip_x = ip_x - oStartX + iStartX;
+
+ if (i < pad_t) {
+ ip_y = pad_t;
+ } else if (i >= pad_t && i < iheight + pad_t) {
+ ip_y = i;
+ } else {
+ ip_y = iheight + pad_t - 1;
+ }
+ ip_y = ip_y - oStartY + iStartY;
+
+ real *dest_p = output_p + k*owidth*oheight + i * owidth + j;
+ real *src_p = input_p + k*iwidth*iheight + ip_y * iwidth + ip_x;
+ *dest_p = *src_p;
+ }
+ }
+ }
+}
+
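+/* Replication differs from reflection only in the edge rule above: pad
+   columns clamp to the nearest valid input column (pad_l on the left,
+   iwidth + pad_l - 1 on the right) instead of mirroring, so the example
+   from the reflection file (iwidth = 5, pad_l = pad_r = 2) would read
+   input columns [0,0,0,1,2,3,4,4,4]. */
+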
+void THNN_(SpatialReplicationPadding_updateOutput)(THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b)
+{
+ int dimw = 2;
+ int dimh = 1;
+ int dimslices = 0;
+ long nbatch = 1;
+ long nslices;
+ long iheight;
+ long iwidth;
+ long oheight;
+ long owidth;
+ real *input_data;
+ real *output_data;
+
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D (batch mode) tensor expected for input, but got: %s");
+
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ dimslices++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimslices];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ oheight = iheight + pad_t + pad_b;
+ owidth = iwidth + pad_l + pad_r;
+
+  THArgCheck(owidth >= 1 || oheight >= 1, 2,
+             "input (H: %d, W: %d) is too small."
+ " Calculated output H: %d W: %d",
+ iheight, iwidth, oheight, owidth);
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ /* resize output */
+ if (input->nDimension == 3)
+ {
+ THTensor_(resize3d)(output, nslices, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+ THNN_(SpatialReplicationPadding_updateOutput_frame)(input_data, output_data,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ pad_l, pad_r,
+ pad_t, pad_b);
+ }
+ else
+ {
+ long p;
+
+ THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(SpatialReplicationPadding_updateOutput_frame)(
+ input_data+p*nslices*iwidth*iheight,
+ output_data+p*nslices*owidth*oheight,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ pad_l, pad_r,
+ pad_t, pad_b);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+}
+
+static void THNN_(SpatialReplicationPadding_updateGradInput_frame)(
+ real *ginput_p, real *goutput_p,
+ long nslices,
+ long iwidth, long iheight,
+ long owidth, long oheight,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b)
+{
+ int iStartX = fmax(0, -pad_l);
+ int iStartY = fmax(0, -pad_t);
+ int oStartX = fmax(0, pad_l);
+ int oStartY = fmax(0, pad_t);
+
+ long k, ip_x, ip_y;
+#pragma omp parallel for private(k, ip_x, ip_y)
+ for (k = 0; k < nslices; k++)
+ {
+ long i, j;
+ for (i = 0; i < oheight; i++) {
+ for (j = 0; j < owidth; j++) {
+ if (j < pad_l) {
+ ip_x = pad_l;
+ } else if (j >= pad_l && j < iwidth + pad_l) {
+ ip_x = j;
+ } else {
+ ip_x = iwidth + pad_l - 1;
+ }
+ ip_x = ip_x - oStartX + iStartX;
+
+ if (i < pad_t) {
+ ip_y = pad_t;
+ } else if (i >= pad_t && i < iheight + pad_t) {
+ ip_y = i;
+ } else {
+ ip_y = iheight + pad_t - 1;
+ }
+ ip_y = ip_y - oStartY + iStartY;
+
+ real *src_p = goutput_p + k*owidth*oheight + i * owidth + j;
+ real *dest_p = ginput_p + k*iwidth*iheight + ip_y * iwidth + ip_x;
+ *dest_p += *src_p;
+ }
+ }
+ }
+}
+
+void THNN_(SpatialReplicationPadding_updateGradInput)(THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b)
+{
+ int dimw = 2;
+ int dimh = 1;
+ int dimslices = 0;
+ long nbatch = 1;
+ long nslices;
+ long iheight;
+ long iwidth;
+ long oheight;
+ long owidth;
+
+ if (input->nDimension == 4)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ dimslices++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimslices];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ oheight = iheight + pad_t + pad_b;
+ owidth = iwidth + pad_l + pad_r;
+
+ THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3,
+ "gradOutput width unexpected. Expected: %d, Got: %d",
+ owidth, THTensor_(size)(gradOutput, dimw));
+ THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3,
+ "gradOutput height unexpected. Expected: %d, Got: %d",
+ oheight, THTensor_(size)(gradOutput, dimh));
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ /* backprop */
+ if (input->nDimension == 3) {
+ THNN_(SpatialReplicationPadding_updateGradInput_frame)(
+ THTensor_(data)(gradInput),
+ THTensor_(data)(gradOutput),
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ pad_l, pad_r,
+ pad_t, pad_b);
+ } else {
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++) {
+ THNN_(SpatialReplicationPadding_updateGradInput_frame)(
+ THTensor_(data)(gradInput) + p * nslices * iheight * iwidth,
+ THTensor_(data)(gradOutput) + p * nslices * oheight * owidth,
+ nslices,
+ iwidth, iheight,
+ owidth, oheight,
+ pad_l, pad_r,
+ pad_t, pad_b);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialSubSampling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialSubSampling.c
new file mode 100644
index 000000000..4c077bc64
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialSubSampling.c
@@ -0,0 +1,302 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialSubSampling.c"
+#else
+
+static inline void THNN_(SpatialSubSampling_shapeCheck)(
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *weight,
+ int kW, int kH) {
+ int ndims = input->nDimension;
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+ THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+
+ int nInputPlane = THTensor_(size)(weight, 0);
+
+ int dimw = 2;
+ int dimh = 1;
+
+ long inputWidth;
+ long inputHeight;
+
+ if (input->nDimension == 4) {
+ dimw++;
+ dimh++;
+ }
+
+ inputWidth = input->size[dimw];
+ inputHeight = input->size[dimh];
+
+ THArgCheck(input->size[dimh-1] == nInputPlane, 2, "invalid number of input planes");
+ THArgCheck(inputWidth >= kW && inputHeight >= kH, 2, "input image smaller than kernel size");
+}
+
+void THNN_(SpatialSubSampling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ int kW, int kH,
+ int dW, int dH)
+{
+ THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous");
+
+ real *weight_data = THTensor_(data)(weight);
+ real *bias_data = THTensor_(data)(bias);
+ real *output_data;
+ real *input_data;
+
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+
+ long inputWidth;
+ long inputHeight;
+ long outputWidth;
+ long outputHeight;
+
+ int nInputPlane = THTensor_(size)(weight,0);
+
+ long k;
+
+ THNN_(SpatialSubSampling_shapeCheck)(input, NULL, weight, kW, kH);
+
+ if (input->nDimension == 4) {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ inputWidth = input->size[dimw];
+ inputHeight = input->size[dimh];
+ outputWidth = (inputWidth - kW) / dW + 1;
+ outputHeight = (inputHeight - kH) / dH + 1;
+
+ if (input->nDimension == 3)
+ THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth);
+ else
+ THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(k)
+ for(k = 0; k < nInputPlane; k++)
+ {
+ long p;
+ for(p = 0; p < nbatch; p++)
+ {
+ long xx, yy;
+ /* For all output pixels... */
+ real *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight;
+ /* Get the good mask for (k,i) (k out, i in) */
+ real the_weight = weight_data[k];
+ /* Initialize to the bias */
+ real z = bias_data[k];
+ long i;
+ for(i = 0; i < outputWidth*outputHeight; i++)
+ ptr_output[i] = z;
+
+ for(yy = 0; yy < outputHeight; yy++)
+ {
+ for(xx = 0; xx < outputWidth; xx++)
+ {
+ /* Compute the mean of the input image... */
+ real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW;
+ real sum = 0;
+ long kx, ky;
+
+ for(ky = 0; ky < kH; ky++)
+ {
+ for(kx = 0; kx < kW; kx++)
+ sum += ptr_input[kx];
+ ptr_input += inputWidth; /* next input line */
+ }
+ /* Update output */
+ *ptr_output++ += the_weight*sum;
+ }
+ }
+ }
+ }
+ THTensor_(free)(input);
+}
+
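+/* A small worked example for the forward pass above (hypothetical sizes):
+   inputWidth = 8, kW = 2, dW = 2 gives outputWidth = (8 - 2)/2 + 1 = 4, and
+   each output pixel is bias[k] + weight[k] * (sum of its kW*kH input window):
+   a learned per-plane affine rescaling of pooled window sums. */
+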
+void THNN_(SpatialSubSampling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ int kW, int kH,
+ int dW, int dH)
+{
+ THNN_(SpatialSubSampling_shapeCheck)(input, gradOutput, weight, kW, kH);
+
+ int dimw = 2;
+ int dimh = 1;
+ long nbatch = 1;
+
+ long inputWidth;
+ long inputHeight;
+ long outputWidth;
+ long outputHeight;
+
+ int nInputPlane = THTensor_(size)(weight,0);
+
+ real *weight_data;
+ real *gradOutput_data;
+ real *input_data, *gradInput_data;
+
+ long k;
+
+ if (input->nDimension == 4) {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ }
+
+ inputWidth = input->size[dimw];
+ inputHeight = input->size[dimh];
+ outputWidth = (inputWidth - kW) / dW + 1;
+ outputHeight = (inputHeight - kH) / dH + 1;
+
+ weight_data = THTensor_(data)(weight);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+
+ input_data = THTensor_(data)(input);
+
+ THTensor_(resizeAs)(gradInput, input);
+ gradInput_data = THTensor_(data)(gradInput);
+
+#pragma omp parallel for private(k)
+ for(k = 0; k < nInputPlane; k++)
+ {
+ long p;
+ for(p = 0; p < nbatch; p++)
+ {
+ real the_weight = weight_data[k];
+ real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
+ long xx, yy;
+
+ real* ptr_gi = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight;
+ long i;
+ for(i=0; i<inputWidth*inputHeight; i++)
+ ptr_gi[i] = 0.0;
+
+ for(yy = 0; yy < outputHeight; yy++)
+ {
+ for(xx = 0; xx < outputWidth; xx++)
+ {
+ real *ptr_gradInput = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW;
+ real z = *ptr_gradOutput++ * the_weight;
+ long kx, ky;
+
+ for(ky = 0; ky < kH; ky++)
+ {
+ for(kx = 0; kx < kW; kx++)
+ ptr_gradInput[kx] += z;
+ ptr_gradInput += inputWidth;
+ }
+ }
+ }
+ }
+ }
+ THTensor_(free)(gradOutput);
+}
+
+void THNN_(SpatialSubSampling_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ int kW, int kH,
+ int dW, int dH,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THNN_(SpatialSubSampling_shapeCheck)(input, gradOutput, gradWeight, kW, kH);
+
+ long nbatch = 1;
+ long dimw = 2;
+ long dimh = 1;
+
+ long inputWidth;
+ long inputHeight;
+ long outputWidth;
+ long outputHeight;
+
+ int nInputPlane = THTensor_(size)(gradWeight,0);
+
+ real *gradWeight_data;
+ real *gradBias_data;
+ real *gradOutput_data;
+ real *input_data;
+
+ long k;
+
+ if (input->nDimension == 4) {
+ dimw++;
+ dimh++;
+ nbatch = input->size[0];
+ }
+
+ inputWidth = input->size[dimw];
+ inputHeight = input->size[dimh];
+ outputWidth = (inputWidth - kW) / dW + 1;
+ outputHeight = (inputHeight - kH) / dH + 1;
+
+ gradWeight_data = THTensor_(data)(gradWeight);
+ gradBias_data = THTensor_(data)(gradBias);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+
+ input = THTensor_(newContiguous)(input);
+ input_data = THTensor_(data)(input);
+
+#pragma omp parallel for private(k)
+ for(k = 0; k < nInputPlane; k++)
+ {
+ long p;
+ for(p = 0; p < nbatch; p++)
+ {
+ real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight;
+ real sum;
+ long xx, yy;
+ long i;
+
+ sum = 0;
+ for(i = 0; i < outputWidth*outputHeight; i++)
+ sum += ptr_gradOutput[i];
+ gradBias_data[k] += scale*sum;
+
+ sum = 0;
+ for(yy = 0; yy < outputHeight; yy++)
+ {
+ for(xx = 0; xx < outputWidth; xx++)
+ {
+ real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW;
+ real z = *ptr_gradOutput++;
+ long kx, ky;
+
+ for(ky = 0; ky < kH; ky++)
+ {
+ for(kx = 0; kx < kW; kx++)
+ sum += z * ptr_input[kx];
+ ptr_input += inputWidth;
+ }
+ }
+ }
+ gradWeight_data[k] += scale*sum;
+ }
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingBilinear.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingBilinear.c
new file mode 100644
index 000000000..8bc487ead
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingBilinear.c
@@ -0,0 +1,174 @@
+// Adapted from interp.cpp in the Caffe utilities, by Pauline Luc
+// Originally developed by George Papandreou
+
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialUpSamplingBilinear.c"
+#else
+
+static inline void THNN_(SpatialUpSamplingBilinear_shapeCheck)
+ (THTensor *input, THTensor *gradOutput,
+ int nBatch, int nChannels,
+ int inputHeight, int inputWidth,
+ int outputHeight, int outputWidth) {
+ THArgCheck(inputHeight > 0 && inputWidth > 0
+ && outputHeight > 0 && outputWidth > 0, 2,
+ "input and output sizes should be greater than 0,"
+ " but got input (H: %d, W: %d) output (H: %d, W: %d)",
+ inputHeight, inputWidth, outputHeight, outputWidth);
+ if (input != NULL) {
+ THNN_ARGCHECK(input->nDimension == 4, 2, input,
+ "4D input tensor expected but got: %s");
+ }
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nBatch);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, nChannels);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, outputWidth);
+ }
+}
+
+void THNN_(SpatialUpSamplingBilinear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int outputHeight,
+ int outputWidth){
+
+ int nbatch = THTensor_(size)(input, 0);
+ int channels = THTensor_(size)(input, 1);
+ int inputHeight = THTensor_(size)(input, 2);
+ int inputWidth = THTensor_(size)(input, 3);
+
+ THNN_(SpatialUpSamplingBilinear_shapeCheck)
+ (input, NULL,
+ nbatch, channels,
+ inputHeight, inputWidth,
+ outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
+ THTensor_(resize4d)(output,
+ THTensor_(size)(input, 0),
+ THTensor_(size)(input, 1),
+ outputHeight, outputWidth);
+ THTensor_(zero)(output);
+ real *idata = THTensor_(data)(input);
+ real *odata = THTensor_(data)(output);
+ channels = nbatch * channels;
+ THAssert(inputHeight > 0 && inputWidth > 0 && outputHeight > 0 && outputWidth > 0);
+ // special case: just copy
+ if (inputHeight == outputHeight && inputWidth == outputWidth) {
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
+ const int h1 = h2;
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
+ const int w1 = w2;
+ const real* pos1 = &idata[h1 * inputWidth + w1];
+ real* pos2 = &odata[h2 * outputWidth + w2];
+ for (int c = 0; c < channels; ++c) {
+ pos2[0] = pos1[0];
+ pos1 += inputWidth * inputHeight;
+ pos2 += outputWidth * outputHeight;
+ }
+ }
+ }
+ return;
+ }
+  const float rheight = (outputHeight > 1) ? (float)(inputHeight - 1) / (outputHeight - 1) : 0.f;
+  const float rwidth = (outputWidth > 1) ? (float)(inputWidth - 1) / (outputWidth - 1) : 0.f;
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
+ const float h1r = rheight * h2;
+ const int h1 = h1r;
+ const int h1p = (h1 < inputHeight - 1) ? 1 : 0;
+ const real h1lambda = h1r - h1;
+ const real h0lambda = (real)1. - h1lambda;
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
+ const float w1r = rwidth * w2;
+ const int w1 = w1r;
+ const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
+ const real w1lambda = w1r - w1;
+ const real w0lambda = (real)1. - w1lambda;
+ const real* pos1 = &idata[h1 * inputWidth + w1];
+ real* pos2 = &odata[h2 * outputWidth + w2];
+ for (int c = 0; c < channels; ++c) {
+        pos2[0] = h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p])
+ + h1lambda * (w0lambda * pos1[h1p * inputWidth]
+ + w1lambda * pos1[h1p * inputWidth + w1p]);
+ pos1 += inputWidth * inputHeight;
+ pos2 += outputWidth * outputHeight;
+ }
+ }
+ }
+ THTensor_(free)(input);
+}
+
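+/* Interpolation-weight sketch for the loop above (hypothetical sizes):
+   upsampling H from 2 to 4 gives rheight = (2-1)/(4-1) = 1/3, so output row
+   h2 = 2 has h1r = 2/3, h1 = 0, h1lambda = 2/3, h0lambda = 1/3: one third of
+   input row 0 plus two thirds of input row 1 (align-corners bilinear). */
+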
+void THNN_(SpatialUpSamplingBilinear_updateGradInput)(
+ THNNState *state,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int nbatch,
+ int channels,
+ int inputHeight,
+ int inputWidth,
+ int outputHeight,
+ int outputWidth){
+
+ THNN_(SpatialUpSamplingBilinear_shapeCheck)
+ (NULL, gradOutput,
+ nbatch, channels,
+ inputHeight, inputWidth,
+ outputHeight, outputWidth);
+
+ THTensor_(resize4d)(gradInput, nbatch, channels, inputHeight, inputWidth);
+ THTensor_(zero)(gradInput);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ real *data1 = THTensor_(data)(gradInput);
+ real *data2 = THTensor_(data)(gradOutput);
+ channels = nbatch * channels;
+
+ // special case: same-size matching grids
+ if (inputHeight == outputHeight && inputWidth == outputWidth) {
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
+ const int h1 = h2;
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
+ const int w1 = w2;
+ real* pos1 = &data1[h1 * inputWidth + w1];
+ const real* pos2 = &data2[h2 * outputWidth + w2];
+ for (int c = 0; c < channels; ++c) {
+ pos1[0] += pos2[0];
+ pos1 += inputWidth * inputHeight;
+ pos2 += outputWidth * outputHeight;
+ }
+ }
+ }
+ return;
+ }
+  const float rheight = (outputHeight > 1) ? (float)(inputHeight - 1) / (outputHeight - 1) : 0.f;
+  const float rwidth = (outputWidth > 1) ? (float)(inputWidth - 1) / (outputWidth - 1) : 0.f;
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
+ const float h1r = rheight * h2;
+ const int h1 = h1r;
+ const int h1p = (h1 < inputHeight - 1) ? 1 : 0;
+ const real h1lambda = h1r - h1;
+ const real h0lambda = (real)1. - h1lambda;
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
+ const float w1r = rwidth * w2;
+ const int w1 = w1r;
+ const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
+ const real w1lambda = w1r - w1;
+ const real w0lambda = (real)1. - w1lambda;
+ real* pos1 = &data1[h1 * inputWidth + w1];
+ const real* pos2 = &data2[h2 * outputWidth + w2];
+ for (int c = 0; c < channels; ++c) {
+ pos1[0] += h0lambda * w0lambda * pos2[0];
+ pos1[w1p] += h0lambda * w1lambda * pos2[0];
+ pos1[h1p * inputWidth] += h1lambda * w0lambda * pos2[0];
+ pos1[h1p * inputWidth + w1p] += h1lambda * w1lambda * pos2[0];
+ pos1 += inputWidth * inputHeight;
+ pos2 += outputWidth * outputHeight;
+ }
+ }
+ }
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingNearest.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingNearest.c
new file mode 100644
index 000000000..b4699ff3e
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingNearest.c
@@ -0,0 +1,199 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialUpSamplingNearest.c"
+#else
+
+
+static inline void THNN_(SpatialUpSamplingNearest_shapeCheck)
+ (THTensor *input, THTensor *gradOutput,
+ int scale_factor) {
+  THArgCheck(input != NULL, 2, "3D or 4D input tensor expected but got NULL");
+ THArgCheck(scale_factor > 1, 4,
+ "scale_factor must be greater than 1, but got: %d", scale_factor);
+ THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input,
+ "3D or 4D input tensor expected but got: %s");
+ if (input->nDimension == 3) {
+ int nChannels = THTensor_(size)(input, 0);
+ int inputHeight = THTensor_(size)(input, 1);
+ int inputWidth = THTensor_(size)(input, 2);
+ int outputHeight = inputHeight * scale_factor;
+ int outputWidth = inputWidth * scale_factor;
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, 3, 0, nChannels);
+ THNN_CHECK_DIM_SIZE(gradOutput, 3, 1, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, 3, 2, outputWidth);
+ }
+ } else {
+ int nBatch = THTensor_(size)(input, 0);
+ int nChannels = THTensor_(size)(input, 1);
+ int inputHeight = THTensor_(size)(input, 2);
+ int inputWidth = THTensor_(size)(input, 3);
+ int outputHeight = inputHeight * scale_factor;
+ int outputWidth = inputWidth * scale_factor;
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nBatch);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, nChannels);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, outputWidth);
+ }
+ }
+}
+
+void THNN_(SpatialUpSamplingNearest_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int scale_factor)
+{
+ THNN_(SpatialUpSamplingNearest_shapeCheck)(input, NULL, scale_factor);
+ int inputHeight = THTensor_(size)(input, input->nDimension-2);
+ int inputWidth = THTensor_(size)(input, input->nDimension-1);
+ int outputHeight = inputHeight * scale_factor;
+ int outputWidth = inputWidth * scale_factor;
+
+ if (input->nDimension == 3) {
+ THTensor_(resize3d)(output,
+ THTensor_(size)(input, 0),
+ outputHeight, outputWidth);
+ } else {
+ THTensor_(resize4d)(output,
+ THTensor_(size)(input, 0),
+ THTensor_(size)(input, 1),
+ outputHeight, outputWidth);
+ }
+
+ int dW = scale_factor;
+ int dH = scale_factor;
+ int xDim = input->nDimension-2;
+ int yDim = input->nDimension-1;
+
+ // dims
+ int idim = input->nDimension;
+ int osz0 = output->size[0];
+ int osz1 = output->size[1];
+ int osz2 = output->size[2];
+ int osz3 = 1;
+ if (idim > 3) {
+ osz3 = output->size[3];
+ }
+
+ // get strides
+ long *is = input->stride;
+ long *os = output->stride;
+
+ // get raw pointers
+ real *pin = THTensor_(data)(input);
+ real *pout = THTensor_(data)(output);
+
+ // perform the upsampling
+ int i0, i1, i2, i3, isrc, idst;
+ int iout[4]; // Output indices
+ int iin[4]; // Input indices
+
+ for (i0 = 0; i0 < osz0; i0++) {
+ iout[0] = i0;
+ iin[0] = i0;
+ for (i1 = 0; i1 < osz1; i1++) {
+ iout[1] = i1;
+ iin[1] = i1;
+ for (i2 = 0; i2 < osz2; i2++) {
+ iout[2] = i2;
+ iin[2] = i2;
+ for (i3 = 0; i3 < osz3; i3++) {
+ iout[3] = i3;
+ iin[3] = i3;
+
+ // set the indices for the upsampled dimensions
+ iin[xDim] = iout[xDim] / dW;
+ iin[yDim] = iout[yDim] / dH;
+
+ idst = i0*os[0] + i1*os[1] + i2*os[2];
+ isrc = iin[0]*is[0] + iin[1]*is[1] + iin[2]*is[2];
+ if (idim > 3) {
+ idst += i3*os[3];
+ isrc += iin[3]*is[3];
+ }
+
+ pout[idst] = pin[isrc];
+ }
+ }
+ }
+ }
+}
+
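+/* Mapping assumed above: each output index maps back by integer division,
+   iin = iout / scale_factor, so every input pixel is replicated into a
+   scale_factor x scale_factor block (e.g. scale 2 sends output columns
+   {0,1} to input column 0 and {2,3} to column 1). */
+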
+void THNN_(SpatialUpSamplingNearest_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int scale_factor)
+{
+ THNN_(SpatialUpSamplingNearest_shapeCheck)(input, gradOutput, scale_factor);
+ THTensor_(resizeAs)(gradInput, input);
+
+ int dW = scale_factor;
+ int dH = scale_factor;
+ int xDim = gradInput->nDimension-2;
+ int yDim = gradInput->nDimension-1;
+
+ // dims
+  int idim = gradInput->nDimension; // guaranteed to be 3 or 4 by the shape check
+ int isz0 = gradInput->size[0];
+ int isz1 = gradInput->size[1];
+ int isz2 = gradInput->size[2];
+ int isz3 = 1;
+ if (idim > 3) {
+ isz3 = gradInput->size[3];
+ }
+
+ // get strides
+ long *is = gradInput->stride;
+ long *os = gradOutput->stride;
+
+ // get raw pointers
+ real *pin = THTensor_(data)(gradInput);
+ real *pout = THTensor_(data)(gradOutput);
+
+ // perform the upsampling
+ int i0, i1, i2, i3, isrc, idst, x, y;
+ int iin[4]; // Input indices
+ int iout[4]; // Output indices
+
+ THTensor_(zero)(gradInput);
+
+ for (i0 = 0; i0 < isz0; i0++) {
+ iin[0] = i0;
+ iout[0] = i0;
+ for (i1 = 0; i1 < isz1; i1++) {
+ iin[1] = i1;
+ iout[1] = i1;
+ for (i2 = 0; i2 < isz2; i2++) {
+ iin[2] = i2;
+ iout[2] = i2;
+ for (i3 = 0; i3 < isz3; i3++) {
+ iin[3] = i3;
+ iout[3] = i3;
+
+ idst = i0*is[0] + i1*is[1] + i2*is[2];
+ if (idim > 3) {
+ idst += i3*is[3];
+ }
+
+ // Now accumulate the gradients from gradOutput
+ for (y = 0; y < dH; y++) {
+ for (x = 0; x < dW; x++) {
+ iout[xDim] = dW * iin[xDim] + x;
+ iout[yDim] = dH * iin[yDim] + y;
+ isrc = iout[0]*os[0] + iout[1]*os[1] + iout[2]*os[2];
+ if (idim > 3) {
+ isrc += iout[3]*os[3];
+ }
+ pin[idst] += pout[isrc];
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Sqrt.c b/contrib/lua-torch/nn/lib/THNN/generic/Sqrt.c
new file mode 100644
index 000000000..174884e34
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/Sqrt.c
@@ -0,0 +1,52 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Sqrt.c"
+#else
+
+void THNN_(Sqrt_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal eps_)
+{
+  real eps = TH_CONVERT_ACCREAL_TO_REAL(eps_); /* accepted for API compatibility; unused below */
+ THTensor_(resizeAs)(output, input);
+ THTensor_(sqrt)(output, input);
+}
+
+void THNN_(Sqrt_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output)
+{
+ THNN_CHECK_SHAPE(output, gradOutput);
+ THTensor_(resizeAs)(gradInput, input);
+
+ if (output->nDimension == 1 ||
+ !THTensor_(isContiguous)(output) ||
+ !THTensor_(isContiguous)(gradOutput) ||
+ !THTensor_(isContiguous)(gradInput))
+ {
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
+ *gradInput_data = (*output_data == 0.0) ? 0.0 : (0.5 * (*gradOutput_data / *output_data));
+ );
+ }
+ else
+ {
+ real *gradOutput_data = THTensor_(data)(gradOutput);
+ real *gradInput_data = THTensor_(data)(gradInput);
+ real *output_data = THTensor_(data)(output);
+ long i;
+#pragma omp parallel for private(i)
+ for(i = 0; i < THTensor_(nElement)(output); i++)
+ {
+ if (output_data[i] == 0.0)
+ gradInput_data[i] = 0.0;
+ else
+ gradInput_data[i] = 0.5 * (gradOutput_data[i] / output_data[i]);
+ }
+ }
+}
+
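+/* Gradient rule implemented above: d/dx sqrt(x) = 1/(2*sqrt(x)), hence
+   gradInput = gradOutput / (2*output); the output == 0 case is clamped to 0
+   since the true derivative diverges at x = 0. */
+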
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Square.c b/contrib/lua-torch/nn/lib/THNN/generic/Square.c
new file mode 100644
index 000000000..aad0a911c
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/Square.c
@@ -0,0 +1,59 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Square.c"
+#else
+
+void THNN_(Square_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output)
+{
+ THTensor_(resizeAs)(output, input);
+
+ if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output))
+ {
+ TH_TENSOR_APPLY2(real, output, real, input,
+ *output_data = (*input_data) * (*input_data);
+ );
+ }
+ else
+ {
+ real *output_data = THTensor_(data)(output);
+ real *input_data = THTensor_(data)(input);
+ long i;
+#pragma omp parallel for private(i)
+ for (i = 0; i < THTensor_(nElement)(input); i++)
+ output_data[i] = input_data[i]*input_data[i];
+ }
+}
+
+void THNN_(Square_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput)
+{
+ THNN_CHECK_SHAPE(input, gradOutput);
+ THTensor_(resizeAs)(gradInput, input);
+
+ if (input->nDimension == 1 ||
+ !THTensor_(isContiguous)(input) ||
+ !THTensor_(isContiguous)(gradOutput) ||
+ !THTensor_(isContiguous)(gradInput))
+ {
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ *gradInput_data = 2.0 * (*gradOutput_data) * (*input_data);
+ );
+ }
+ else
+ {
+ real *gradOutput_data = THTensor_(data)(gradOutput);
+ real *gradInput_data = THTensor_(data)(gradInput);
+ real *input_data = THTensor_(data)(input);
+ long i;
+#pragma omp parallel for private(i)
+ for (i = 0; i < THTensor_(nElement)(gradInput); i++)
+ gradInput_data[i] = 2.0 * gradOutput_data[i] * input_data[i];
+ }
+}
+
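+/* Gradient rule implemented above: d/dx x^2 = 2x, hence
+   gradInput = 2 * input * gradOutput, elementwise. */
+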
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/THNN.h b/contrib/lua-torch/nn/lib/THNN/generic/THNN.h
new file mode 100644
index 000000000..76a28eb2d
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/THNN.h
@@ -0,0 +1,1501 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/THNN.h"
+#else
+
+TH_API void THNN_(Abs_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *output); // [OUT] Abs output
+TH_API void THNN_(Abs_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. output
+ THTensor *gradInput); // [OUT] gradient w.r.t. input
+
+TH_API void THNN_(AbsCriterion_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *target, // tensor with target values
+ THTensor *output, // [OUT] a one-element tensor with loss
+ bool sizeAverage); // if true, the loss will be divided by batch size
+TH_API void THNN_(AbsCriterion_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *target, // tensor with target values
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ bool sizeAverage); // if true, the gradient will be normalized by batch size
+
+TH_API void THNN_(BCECriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage,
+ THTensor *weights); // [OPTIONAL]
+TH_API void THNN_(BCECriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage,
+ THTensor *weights); // [OPTIONAL]
+
+TH_API void THNN_(ClassNLLCriterion_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor (1D/2D)
+ THIndexTensor *target, // tensor containing indexes of target classes
+ THTensor *output, // [OUT] a one-element tensor with loss
+ bool sizeAverage, // if true, the loss will be normalized by batch size and class weights
+ THTensor *weights, // [OPTIONAL] class weights
+ THTensor *total_weight, // [BUFFER]
+ long ignore_index); // target index to ignore (loss = 0, gradInput = 0)
+TH_API void THNN_(ClassNLLCriterion_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor (1D/2D)
+ THIndexTensor *target, // tensor containing indexes of target classes
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ bool sizeAverage, // if true, the loss will be normalized by batch size and class weights
+ THTensor *weights, // [OPTIONAL] class weights
+ THTensor *total_weight, // [BUFFER]
+ long ignore_index); // target index to ignore (loss = 0, gradInput = 0)
+
+TH_API void THNN_(SpatialClassNLLCriterion_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor (4D)
+ THIndexTensor *target, // tensor containing indexes of target classes (3D)
+ THTensor *output, // [OUT] a one-element tensor with loss
+ bool sizeAverage, // if true, the loss will be normalized by batch size and class weights
+ THTensor *weights, // [OPTIONAL] class weights
+ THTensor *total_weight); // [BUFFER]
+TH_API void THNN_(SpatialClassNLLCriterion_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor (4D)
+ THIndexTensor *target, // tensor containing indexes of target classes (3D)
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ bool sizeAverage, // if true, the loss will be normalized by batch size and class weights
+ THTensor *weights, // [OPTIONAL] class weights
+ THTensor *total_weight); // [BUFFER]
+
+TH_API void THNN_(ELU_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *output, // [OUT] ELU output
+ accreal alpha, // an ELU parameter (as in paper)
+          bool inplace);            // if true, modifies the input tensor and sets the output tensor on it (no additional memory is allocated)
+TH_API void THNN_(ELU_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. output
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ THTensor *output, // output from a forward pass
+ accreal alpha, // an ELU parameter (as in paper)
+ bool inplace); // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated)
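+// Sketch of the ELU map (the standard definition from the paper, restated
+// here for reference):
+//   f(x) = x                     if x > 0
+//   f(x) = alpha * (exp(x) - 1)  otherwise
+// so the backward pass can be computed from the saved output alone:
+// df/dx = 1 for x > 0 and output + alpha otherwise.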
+
+TH_API void THNN_(DistKLDivCriterion_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *target, // target tensor
+ THTensor *output, // [OUT] a one-element tensor containing the loss
+ bool sizeAverage); // if true, the loss will be normalized **by total number of elements**
+TH_API void THNN_(DistKLDivCriterion_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *target, // target tensor
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ bool sizeAverage); // if true, the loss will be normalized **by total number of elements**
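+// A sketch of the pointwise term (assuming input holds log-probabilities and
+// target holds probabilities): loss = sum target * (log(target) - input),
+// divided by the total number of elements when sizeAverage is true.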
+
+TH_API void THNN_(GatedLinear_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *output, // [OUT] output tensor, half size of input along dimension dim
+ int dim); // dimension for halving operation
+TH_API void THNN_(GatedLinear_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. module's output
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ int dim); // dimension for halving operation
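+// Sketch: the input is split into halves a and b along dimension dim and
+//   output = a * sigmoid(b)
+// which is why the output is half the size of the input along dim.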
+
+// HardShrink outputs 0 on the interval (-lambda; lambda) and the original value otherwise.
+TH_API void THNN_(HardShrink_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *output, // [OUT] output tensor
+ accreal lambda); // HardShrink parameter
+TH_API void THNN_(HardShrink_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. module's output
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ accreal lambda); // HardShrink parameter
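+// Equivalently: f(x) = x if |x| > lambda, else 0; the gradient is passed
+// through where |x| > lambda and zeroed elsewhere.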
+
+// HardTanh clamps the values to the interval [min_val; max_val].
+TH_API void THNN_(HardTanh_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *output, // [OUT] output tensor
+ accreal min_val, // lower threshold
+ accreal max_val, // upper threshold
+ bool inplace);
+TH_API void THNN_(HardTanh_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. module's output
+ THTensor *gradInput, // [OUT] gradient w.r.t. the input
+ accreal min_val, // lower threshold
+ accreal max_val, // upper threshold
+ bool inplace);
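+// A minimal sketch: f(x) = max(min_val, min(x, max_val)); the gradient is
+// passed through only where min_val < x < max_val.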
+
+TH_API void THNN_(L1Cost_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *output); // [OUT] output tensor
+TH_API void THNN_(L1Cost_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // [OPTIONAL] gradient w.r.t. module's output
+ THTensor *gradInput); // [OUT] gradient w.r.t. the input
+
+TH_API void THNN_(LeakyReLU_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // [MODIFIED] input tensor
+ THTensor *output, // [OUT] output tensor
+ accreal negval, // negative part slope
+ bool inplace); // if true, modifies the input tensor and sets the output tensor on it (no additional memory is allocated)
+TH_API void THNN_(LeakyReLU_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // [MODIFIED] gradient w.r.t. module's output
+ THTensor *gradInput, // [OUT] gradient w.r.t. the input
+ accreal negval, // negative part slope
+ bool inplace); // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated)
+
+TH_API void THNN_(GRUFused_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *hidden,
+ THTensor *bias1, // [OPTIONAL]
+ THTensor *bias2, // [OPTIONAL]
+ THTensor *hx,
+ THTensor *output,
+ THTensor *storage);
+TH_API void THNN_(GRUFused_updateGradInput)(
+ THNNState *state,
+ THTensor *gradInInput,
+ THTensor *gradInHidden,
+ THTensor *gradOutput,
+ THTensor *gradInputHx,
+ THTensor *storage);
+
+TH_API void THNN_(LSTMFused_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *hidden,
+ THTensor *bias1, // [OPTIONAL]
+ THTensor *bias2, // [OPTIONAL]
+ THTensor *cell,
+ THTensor *output,
+ THTensor *outputCell);
+TH_API void THNN_(LSTMFused_updateGradInput)(
+ THNNState *state,
+ THTensor *storage,
+ THTensor *gradInGates,
+ THTensor *cx,
+ THTensor *cy,
+ THTensor *gradOutput,
+ THTensor *gradOutputCell,
+ THTensor *gradInputCx);
+
+TH_API void THNN_(LogSigmoid_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *output, // output tensor
+ THTensor *buffer); // [BUFFER]
+TH_API void THNN_(LogSigmoid_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input
+ THTensor *gradOutput, // gradient w.r.t. module's output
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ THTensor *buffer); // [BUFFER]
+
+TH_API void THNN_(LogSoftMax_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *output); // [OUT] output tensor
+TH_API void THNN_(LogSoftMax_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. module's output
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ THTensor *output); // module's output
+
+TH_API void THNN_(LookupTable_accGradParameters)(
+ THNNState *state,
+ THIndexTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THIntegerTensor *count,
+ THTensor *sorted, // [OPTIONAL]
+ THIndexTensor *indices, // [OPTIONAL]
+ bool scaleGradByFreq,
+ int paddingValue,
+ accreal scale);
+
+TH_API void THNN_(LookupTable_renorm)(
+ THNNState *state, // library's state
+ THIndexTensor *idx, // vector containing row indices (modified in function)
+ THTensor *weight, // 2D tensor whose rows will be renormalized
+ accreal maxNorm, // maximum norm
+ accreal normType); // the norm type (e.g., normType=2 selects the 2-norm)
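+// Sketch of the renorm rule, as suggested by the parameter names: each
+// indexed row w with ||w||_normType > maxNorm is rescaled in place to
+// w * maxNorm / ||w||_normType.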
+
+TH_API void THNN_(MarginCriterion_updateOutput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *target, // target tensor (should contain only 1s and -1s)
+ THTensor *output, // [OUT] a one-element tensor containing the loss
+ bool sizeAverage, // if true, the loss is normalized by **total number of elements**
+ accreal margin); // a margin that is required for the loss to be 0
+
+TH_API void THNN_(MarginCriterion_updateGradInput)(
+ THNNState *state, // library's state
+ THTensor *input, // input tensor
+ THTensor *target, // target tensor (should contain only 1s and -1s)
+ THTensor *gradInput, // [OUT] gradient w.r.t. module's input
+ bool sizeAverage, // if true, the gradient is normalized by **total number of elements**
+ accreal margin); // a margin that is required for the loss to be 0
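+// A minimal sketch of the hinge loss computed above:
+//   loss = sum_i max(0, margin - y_i * x_i),  y_i in {-1, 1},
+// divided by the total number of elements when sizeAverage is true.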
+
+TH_API void THNN_(SoftMarginCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage);
+
+TH_API void THNN_(SoftMarginCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage);
+
+TH_API void THNN_(MSECriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage);
+TH_API void THNN_(MSECriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage);
+
+TH_API void THNN_(MultiLabelMarginCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *output,
+ THTensor *isTarget,
+ bool sizeAverage);
+TH_API void THNN_(MultiLabelMarginCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *gradInput,
+ THTensor *isTarget,
+ bool sizeAverage);
+
+TH_API void THNN_(MultiMarginCriterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *output,
+ bool sizeAverage,
+ int p,
+ THTensor* weights, // [OPTIONAL]
+ accreal margin);
+TH_API void THNN_(MultiMarginCriterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THIndexTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage,
+ int p,
+ THTensor *weights, // [OPTIONAL]
+ accreal margin);
+
+TH_API void THNN_(PReLU_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THIndex_t nOutputPlane);
+TH_API void THNN_(PReLU_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THIndex_t nOutputPlane);
+TH_API void THNN_(PReLU_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *gradWeight,
+ THTensor *gradWeightBuf,
+ THTensor *gradWeightBuf2,
+ THIndex_t nOutputPlane,
+ accreal scale);
+
+TH_API void THNN_(Linear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *addBuffer);
+TH_API void THNN_(Linear_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight);
+TH_API void THNN_(Linear_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *addBuffer,
+ accreal scale);
+
+TH_API void THNN_(RReLU_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *noise,
+ accreal lower,
+ accreal upper,
+ bool train,
+ bool inplace,
+ THGenerator *generator);
+TH_API void THNN_(RReLU_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *noise,
+ accreal lower,
+ accreal upper,
+ bool train,
+ bool inplace);
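+// Hedged sketch of the RReLU semantics (per the randomized leaky ReLU
+// paper): in training, each negative element is scaled by a slope drawn
+// uniformly from [lower, upper], recorded in noise for the backward pass;
+// at evaluation time the fixed slope (lower + upper) / 2 is used instead.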
+
+TH_API void THNN_(Sigmoid_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output);
+TH_API void THNN_(Sigmoid_updateGradInput)(
+ THNNState *state,
+ THTensor *input, // [OPTIONAL]
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output);
+
+TH_API void THNN_(SmoothL1Criterion_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *output,
+ bool sizeAverage);
+TH_API void THNN_(SmoothL1Criterion_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *target,
+ THTensor *gradInput,
+ bool sizeAverage);
+
+TH_API void THNN_(SoftMax_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output);
+TH_API void THNN_(SoftMax_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output);
+
+TH_API void THNN_(SoftPlus_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal beta,
+ accreal threshold);
+TH_API void THNN_(SoftPlus_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output,
+ accreal beta,
+ accreal threshold);
+
+TH_API void THNN_(SoftShrink_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal lambda);
+TH_API void THNN_(SoftShrink_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ accreal lambda);
+
+
+TH_API void THNN_(IndexLinear_updateOutput)(
+ THNNState *state,
+ THIndexTensor *keys,
+ long keysOffset,
+ THTensor *values,
+ THIndexTensor *sizes,
+ THIndexTensor *cumSumSizes,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *normalizedValues,
+ int train);
+TH_API void THNN_(IndexLinear_accGradParameters)(
+ THNNState *state,
+ THIndexTensor *keys,
+ long keysOffset,
+ THTensor *values,
+ THIndexTensor *sizes,
+ THIndexTensor *cumSumSizes,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor* valuesBuffer,
+ accreal weightDecay,
+ accreal scale);
+TH_API void THNN_(IndexLinear_accUpdateGradParameters)(
+ THNNState *state,
+ THIndexTensor *keys,
+ long keysOffset,
+ THTensor *values,
+ THIndexTensor *sizes,
+ THIndexTensor *cumSumSizes,
+ THTensor *gradOutput,
+ THTensor *weight,
+ THTensor *bias,
+ accreal weightDecay,
+ accreal scale);
+TH_API void THNN_(IndexLinear_updateParameters)(
+ THNNState *state,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *weight,
+ THTensor *bias,
+ THIndexTensor *runningKeys,
+ THIndexTensor *cumSumSizes,
+ long keysOffset,
+ accreal weightDecay,
+ accreal learningRate);
+
+TH_API void THNN_(SparseLinear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias);
+TH_API void THNN_(SparseLinear_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *weight,
+ THTensor *bias,
+ accreal weightDecay,
+ accreal scale);
+TH_API void THNN_(SparseLinear_zeroGradParameters)(
+ THNNState *state,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *lastInput);
+TH_API void THNN_(SparseLinear_updateParameters)(
+ THNNState *state,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *lastInput,
+ accreal learningRate);
+TH_API void THNN_(SparseLinear_legacyUpdateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias);
+TH_API void THNN_(SparseLinear_legacyAccGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *weight,
+ THTensor *bias,
+ accreal weightDecay,
+ accreal scale);
+TH_API void THNN_(SparseLinear_legacyZeroGradParameters)(
+ THNNState *state,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *lastInput);
+TH_API void THNN_(SparseLinear_legacyUpdateParameters)(
+ THNNState *state,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *lastInput,
+ accreal learningRate);
+
+TH_API void THNN_(Sqrt_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal eps);
+TH_API void THNN_(Sqrt_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output);
+
+TH_API void THNN_(Square_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output);
+TH_API void THNN_(Square_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput);
+
+TH_API void THNN_(Tanh_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output);
+TH_API void THNN_(Tanh_updateGradInput)(
+ THNNState *state,
+ THTensor *input, // [OPTIONAL]
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output);
+
+TH_API void THNN_(Threshold_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal threshold,
+ accreal val,
+ bool inplace);
+TH_API void THNN_(Threshold_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ accreal threshold,
+ accreal val,
+ bool inplace);
+
+TH_API void THNN_(TemporalConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ int kW, int dW,
+ int inputFrameSize,
+ int outputFrameSize);
+TH_API void THNN_(TemporalConvolution_updateGradInput)(
+ THNNState* state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ int kW, int dW);
+TH_API void THNN_(TemporalConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ int kW, int dW,
+ accreal scale);
+TH_API void THNN_(TemporalMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kW, int dW);
+TH_API void THNN_(TemporalMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kW, int dW);
+TH_API void THNN_(TemporalSubSampling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ int kW, int dW,
+ int inputFrameSize);
+TH_API void THNN_(TemporalSubSampling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ int kW, int dW);
+TH_API void THNN_(TemporalSubSampling_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ int kW, int dW,
+ accreal scale);
+
+TH_API void THNN_(TemporalRowConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int dW,
+ int padW,
+ bool featFirst);
+TH_API void THNN_(TemporalRowConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int dW,
+ int padW,
+ bool featFirst);
+TH_API void THNN_(TemporalRowConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int dW,
+ int padW,
+ bool featFirst,
+ accreal scale);
+
+TH_API void THNN_(BatchNormalization_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight, // [OPTIONAL]
+ THTensor *bias, // [OPTIONAL]
+ THTensor *running_mean,
+ THTensor *running_var,
+ THTensor *save_mean,
+ THTensor *save_std,
+ bool train,
+ double momentum,
+ double eps);
+TH_API void THNN_(BatchNormalization_backward)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput, // [OPTIONAL]
+ THTensor *gradWeight, // [OPTIONAL]
+ THTensor *gradBias, // [OPTIONAL]
+ THTensor *weight, // [OPTIONAL]
+ THTensor *running_mean,
+ THTensor *running_var,
+ THTensor *save_mean,
+ THTensor *save_std,
+ bool train,
+ double scale,
+ double eps);
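+// Sketch of the normalization (standard batch norm; save_mean and save_std
+// cache the statistics actually used so the backward pass can reuse them):
+//   y = weight * (x - mean) / sqrt(var + eps) + bias
+// where mean and var are per-channel batch statistics when train is true and
+// the running statistics otherwise; in training the running statistics are
+// updated as a momentum-weighted moving average.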
+
+TH_API void THNN_(SpatialConvolutionMap_updateOutput)(
+ THNNState *state, // library state
+ THTensor *input, // input tensor
+ THTensor *output, // [OUT] convolution output
+ THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW)
+ THTensor *bias, // 1D bias tensor (nOutputPlane)
+ THTensor *connTable, // connection table
+ int nInputPlane, // number of input planes
+ int nOutputPlane, // number of output planes
+ int dW, int dH); // stride
+TH_API void THNN_(SpatialConvolutionMap_updateGradInput)(
+ THNNState *state, // library state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. output
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW)
+ THTensor *bias, // 1D bias tensor (nOutputPlane)
+ THTensor *connTable, // connection table
+ int nInputPlane, // number of input planes
+ int nOutputPlane, // number of output planes
+ int dW, int dH); // stride
+TH_API void THNN_(SpatialConvolutionMap_accGradParameters)(
+ THNNState *state, // library state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. output
+ THTensor *gradWeight, // 3D gradWeight tensor (connTable:size(1) x kH x kW)
+ THTensor *gradBias, // 1D gradBias tensor (nOutputPlane)
+ THTensor *connTable, // connection table
+ int nInputPlane, // number of input planes
+ int nOutputPlane, // number of output planes
+ int dW, int dH, // stride
+ accreal scale); // scaling factor
+
+TH_API void THNN_(SpatialConvolutionMM_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias, // [OPTIONAL]
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH);
+TH_API void THNN_(SpatialConvolutionMM_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH);
+TH_API void THNN_(SpatialConvolutionMM_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias, // [OPTIONAL]
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ accreal scale);
+
+TH_API void THNN_(SpatialDepthWiseConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias, // [OPTIONAL]
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH);
+TH_API void THNN_(SpatialDepthWiseConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH);
+TH_API void THNN_(SpatialDepthWiseConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias, // [OPTIONAL]
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ accreal scale);
+
+TH_API void THNN_(SpatialConvolutionLocal_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ long inputWidth, long inputHeight,
+ long outputWidth, long outputHeight);
+TH_API void THNN_(SpatialConvolutionLocal_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ long inputWidth, long inputHeight,
+ long outputWidth, long outputHeight);
+TH_API void THNN_(SpatialConvolutionLocal_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ long inputWidth, long inputHeight,
+ long outputWidth, long outputHeight,
+ accreal scale);
+
+TH_API void THNN_(SpatialAdaptiveMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int owidth, int oheight);
+TH_API void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices);
+
+TH_API void THNN_(SpatialAdaptiveAveragePooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int owidth, int oheight);
+TH_API void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput);
+
+TH_API void THNN_(SpatialAveragePooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ bool ceil_mode,
+ bool count_include_pad);
+TH_API void THNN_(SpatialAveragePooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ bool ceil_mode,
+ bool count_include_pad);
+
+TH_API void THNN_(SpatialFractionalMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int outputW, int outputH,
+ int poolSizeW, int poolSizeH,
+ THIndexTensor *indices,
+ THTensor *randomSamples);
+TH_API void THNN_(SpatialFractionalMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int outputW, int outputH,
+ int poolSizeW, int poolSizeH,
+ THIndexTensor *indices);
+
+TH_API void THNN_(SpatialFullConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias, // [OPTIONAL]
+ THTensor *columns,
+ THTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int adjW, int adjH);
+TH_API void THNN_(SpatialFullConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *gradColumns,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int adjW, int adjH);
+TH_API void THNN_(SpatialFullConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias, // [OPTIONAL]
+ THTensor *columns,
+ THTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int adjW, int adjH,
+ accreal scale);
+
+TH_API void THNN_(SpatialFullConvolutionMap_updateOutput)(
+ THNNState *state, // library state
+ THTensor *input, // input tensor
+ THTensor *output, // [OUT] convolution output
+ THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW)
+ THTensor *bias, // 1D bias tensor (nOutputPlane)
+ THTensor *connTable, // connection table
+ int nInputPlane, // number of input planes
+ int nOutputPlane, // number of output planes
+ int dW, int dH); // stride
+TH_API void THNN_(SpatialFullConvolutionMap_updateGradInput)(
+ THNNState *state, // library state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. output
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW)
+ THTensor *bias, // 1D bias tensor (nOutputPlane)
+ THTensor *connTable, // connection table
+ int nInputPlane, // number of input planes
+ int nOutputPlane, // number of output planes
+ int dW, int dH); // stride
+TH_API void THNN_(SpatialFullConvolutionMap_accGradParameters)(
+ THNNState *state, // library state
+ THTensor *input, // input tensor
+ THTensor *gradOutput, // gradient w.r.t. output
+ THTensor *gradWeight, // 3D gradWeight tensor (connTable:size(1) x kH x kW)
+ THTensor *gradBias, // 1D gradBias tensor (nOutputPlane)
+ THTensor *connTable, // connection table
+ int nInputPlane, // number of input planes
+ int nOutputPlane, // number of output planes
+ int dW, int dH, // stride
+ accreal scale); // scaling factor
+
+TH_API void THNN_(SpatialDilatedConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias, // [OPTIONAL]
+ THTensor *columns,
+ THTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH);
+
+TH_API void THNN_(SpatialDilatedConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *gradColumns,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH);
+
+TH_API void THNN_(SpatialDilatedConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias, // [OPTIONAL]
+ THTensor *columns,
+ THTensor *ones,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH,
+ accreal scale);
+
+TH_API void THNN_(SpatialMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ bool ceil_mode);
+TH_API void THNN_(SpatialMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ bool ceil_mode);
+
+TH_API void THNN_(SpatialDilatedMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH,
+ bool ceil_mode);
+TH_API void THNN_(SpatialDilatedMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int dilationW, int dilationH,
+ bool ceil_mode);
+
+TH_API void THNN_(SpatialMaxUnpooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int owidth, int oheight);
+TH_API void THNN_(SpatialMaxUnpooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int owidth, int oheight);
+
+TH_API void THNN_(SpatialSubSampling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ int kW, int kH,
+ int dW, int dH);
+TH_API void THNN_(SpatialSubSampling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ int kW, int kH,
+ int dW, int dH);
+TH_API void THNN_(SpatialSubSampling_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ int kW, int kH,
+ int dW, int dH,
+ accreal scale);
+
+TH_API void THNN_(SpatialUpSamplingNearest_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int scale_factor);
+TH_API void THNN_(SpatialUpSamplingNearest_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int scale_factor);
+
+TH_API void THNN_(SpatialUpSamplingBilinear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int outputHeight,
+ int outputWidth);
+TH_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)(
+ THNNState *state,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int nbatch,
+ int nchannels,
+ int inputHeight,
+ int inputWidth,
+ int outputHeight,
+ int outputWidth);
+
+TH_API void THNN_(unfolded_acc)(
+ THTensor *finput,
+ THTensor *input,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int nInputPlane,
+ int inputWidth, int inputHeight,
+ int outputWidth, int outputHeight);
+TH_API void THNN_(unfolded_copy)(
+ THTensor *finput,
+ THTensor *input,
+ int kW, int kH,
+ int dW, int dH,
+ int padW, int padH,
+ int nInputPlane,
+ int inputWidth, int inputHeight,
+ int outputWidth, int outputHeight);
+
+TH_API void THNN_(VolumetricAveragePooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH);
+TH_API void THNN_(VolumetricAveragePooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH);
+
+TH_API void THNN_(VolumetricConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias, // [OPTIONAL]
+ THTensor *finput,
+ THTensor *fgradInput,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH);
+TH_API void THNN_(VolumetricConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH);
+TH_API void THNN_(VolumetricConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias, // [OPTIONAL]
+ THTensor *finput,
+ THTensor *fgradInput,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH,
+ accreal scale);
+
+TH_API void THNN_(VolumetricConvolutionMM_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias, // [OPTIONAL]
+ THTensor *finput,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH);
+TH_API void THNN_(VolumetricConvolutionMM_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH);
+TH_API void THNN_(VolumetricConvolutionMM_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias, // [OPTIONAL]
+ THTensor *finput,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH,
+ accreal scale);
+
+TH_API void THNN_(VolumetricFractionalMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int outputT, int outputW, int outputH,
+ int poolSizeT, int poolSizeW, int poolSizeH,
+ THIndexTensor *indices,
+ THTensor *randomSamples);
+TH_API void THNN_(VolumetricFractionalMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int outputT, int outputW, int outputH,
+ int poolSizeT, int poolSizeW, int poolSizeH,
+ THIndexTensor *indices);
+
+TH_API void THNN_(VolumetricFullConvolution_updateOutput)(
+ THNNState *state, // library state
+ THTensor *input, // 4D or 5D (batch) tensor
+ THTensor *output, // [OUT] volumetric convolution output
+ THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
+ THTensor *bias, // [OPTIONAL] bias tensor (nOutputPlane)
+ THTensor *finput, // [BUFFER] internal columns buffer
+ THTensor *fgradInput, // [BUFFER] internal ones buffer
+ int dT, int dW, int dH, // stride of the convolution
+ int pT, int pW, int pH, // padding
+ int aT, int aW, int aH); // extra output adjustment
+TH_API void THNN_(VolumetricFullConvolution_updateGradInput)(
+ THNNState *state, // library state
+ THTensor *input, // 4D or 5D (batch) tensor
+ THTensor *gradOutput, // gradient w.r.t. output
+ THTensor *gradInput, // [OUT] gradient w.r.t. input
+ THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
+ THTensor *finput, // internal columns buffer
+ THTensor *fgradInput, // internal ones buffer
+ int dT, int dW, int dH, // stride
+ int pT, int pW, int pH, // padding
+ int aT, int aW, int aH); // extra output adjustment
+TH_API void THNN_(VolumetricFullConvolution_accGradParameters)(
+ THNNState *state, // library state
+ THTensor *input, // 4D or 5D (batch) tensor
+ THTensor *gradOutput, // gradient w.r.t. output
+ THTensor *gradWeight, // gradWeight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
+ THTensor *gradBias, // [OPTIONAL] gradBias tensor (nOutputPlane)
+ THTensor *finput, // internal columns buffer
+ THTensor *fgradInput, // internal ones buffer
+ int dT, int dW, int dH, // stride
+ int pT, int pW, int pH, // padding
+ int aT, int aW, int aH, // extra output adjustment
+ accreal scale); // scaling factor
+
+TH_API void THNN_(VolumetricDilatedConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias, // [OPTIONAL]
+ THTensor *columns,
+ THTensor *ones,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH);
+
+TH_API void THNN_(VolumetricDilatedConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *gradColumns,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH);
+
+TH_API void THNN_(VolumetricDilatedConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias, // [OPTIONAL]
+ THTensor *columns,
+ THTensor *ones,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH,
+ accreal scale);
+
+TH_API void THNN_(VolumetricMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH,
+ bool ceilMode);
+TH_API void THNN_(VolumetricMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH,
+ bool ceilMode);
+
+TH_API void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH,
+ int dilationT, int dilationW, int dilationH,
+ bool ceilMode);
+TH_API void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH,
+ int dilationT, int dilationW, int dilationH,
+ bool ceilMode);
+
+TH_API void THNN_(VolumetricMaxUnpooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int oT, int oW, int oH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH);
+TH_API void THNN_(VolumetricMaxUnpooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int oT, int oW, int oH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH);
+
+TH_API void THNN_(SpatialReflectionPadding_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b);
+
+TH_API void THNN_(SpatialReflectionPadding_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b);
+
+TH_API void THNN_(SpatialReplicationPadding_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b);
+
+TH_API void THNN_(SpatialReplicationPadding_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int pad_l, int pad_r,
+ int pad_t, int pad_b);
+
+TH_API void THNN_(VolumetricReplicationPadding_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback);
+
+TH_API void THNN_(VolumetricReplicationPadding_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback);
+
+TH_API void THNN_(VolumetricUpSamplingNearest_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int scale_factor);
+TH_API void THNN_(VolumetricUpSamplingNearest_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int scale_factor);
+
+TH_API void THNN_(VolumetricUpSamplingTrilinear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int outputDepth,
+ int outputHeight,
+ int outputWidth);
+TH_API void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)(
+ THNNState *state,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int nbatch,
+ int nchannels,
+ int inputDepth,
+ int inputHeight,
+ int inputWidth,
+ int outputDepth,
+ int outputHeight,
+ int outputWidth);
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Tanh.c b/contrib/lua-torch/nn/lib/THNN/generic/Tanh.c
new file mode 100644
index 000000000..ecf0708c2
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/Tanh.c
@@ -0,0 +1,49 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Tanh.c"
+#else
+
+void THNN_(Tanh_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output)
+{
+ THTensor_(tanh)(output, input);
+}
+
+void THNN_(Tanh_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *output)
+{
+ THNN_CHECK_SHAPE(output, gradOutput);
+ THTensor_(resizeAs)(gradInput, output);
+
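+ /* 1D or non-contiguous tensors take the generic strided apply macro below;
+ contiguous ones use the flat OpenMP loop instead. */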
+ if (output->nDimension == 1 ||
+ !THTensor_(isContiguous)(output) ||
+ !THTensor_(isContiguous)(gradOutput) ||
+ !THTensor_(isContiguous)(gradInput))
+ {
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output,
+ real z = *output_data; \
+ *gradInput_data = *gradOutput_data * (1. - z*z);
+ );
+ }
+ else
+ {
+ real* ptr_gradOutput = THTensor_(data)(gradOutput);
+ real* ptr_gradInput = THTensor_(data)(gradInput);
+ real* ptr_output = THTensor_(data)(output);
+ long i;
+
+#pragma omp parallel for private(i)
+ for (i = 0; i < THTensor_(nElement)(gradInput); i++)
+ {
+ real z = ptr_output[i];
+ ptr_gradInput[i] = ptr_gradOutput[i] * (1. - z*z);
+ }
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/TemporalConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/TemporalConvolution.c
new file mode 100644
index 000000000..8cfd97d85
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/TemporalConvolution.c
@@ -0,0 +1,398 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/TemporalConvolution.c"
+#else
+
+static inline void THNN_(TemporalConvolution_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ int kW,
+ int dW,
+ int *inputFrameSize) {
+
+ THArgCheck(kW > 0, 9,
+ "kernel size should be greater than zero, but got kW: %d", kW);
+ THArgCheck(dW > 0, 11,
+ "stride should be greater than zero, but got dW: %d", dW);
+
+ int dimS = 0; // sequence dimension
+ int dimF = 1; // feature dimension
+
+ if (input->nDimension == 3)
+ {
+ dimS = 1;
+ dimF = 2;
+ }
+ THNN_ARGCHECK(input->nDimension == 2 || input->nDimension == 3, 2, input,
+ "2D or 3D (batch mode) tensor expected for input, but got: %s");
+ if (inputFrameSize != NULL) {
+ THArgCheck(input->size[dimF] == *inputFrameSize, 2,
+ "invalid input frame size. Got: %d, Expected: %d",
+ input->size[dimF], *inputFrameSize);
+ }
+ THArgCheck(input->size[dimS] >= kW, 2,
+ "input sequence smaller than kernel size. Got: %d, Expected: %d",
+ input->size[dimS], kW);
+}
+
+void THNN_(TemporalConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ int kW,
+ int dW,
+ int inputFrameSize,
+ int outputFrameSize)
+{
+ THTensor *outputWindow, *inputWindow;
+ int nInputFrame, nOutputFrame;
+ long k, i;
+
+ int dimS = 0; // sequence dimension
+ int dimF = 1; // feature dimension
+
+ if (input->nDimension == 3)
+ {
+ dimS = 1;
+ dimF = 2;
+ }
+
+ THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+ THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous");
+ THNN_(TemporalConvolution_shapeCheck)
+ (state, input, kW, dW, &inputFrameSize);
+ input = THTensor_(newContiguous)(input);
+ outputWindow = THTensor_(new)();
+ inputWindow = THTensor_(new)();
+
+ nInputFrame = input->size[dimS];
+ nOutputFrame = (nInputFrame - kW) / dW + 1;
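+ /* valid convolution: one output frame per kW-frame window, stepped by dW */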
+
+ if (input->nDimension == 2)
+ {
+ THTensor_(resize2d)(output,
+ nOutputFrame,
+ outputFrameSize);
+
+ /* bias first */
+ for(k = 0; k < nOutputFrame; k++)
+ {
+ THTensor_(select)(outputWindow, output, 0, k);
+ THTensor_(copy)(outputWindow, bias);
+ }
+
+ /* ouch */
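+ /* Batch every outputFrameStride-th output frame into one GEMM: their input
+ windows start inputFrameStride (>= kW) frames apart, so they do not overlap
+ and each is contiguous in memory; a strided 2D view plus a single addmm then
+ produces nFrame output frames per iteration. */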
+ for(k = 0; nOutputFrame > 0; k++)
+ {
+ long outputFrameStride = (kW-1)/dW+1;
+ long inputFrameStride = outputFrameStride*dW;
+ long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+ nOutputFrame -= nFrame;
+
+ THTensor_(setStorage2d)(inputWindow, input->storage,
+ input->storageOffset+k*dW*input->size[1],
+ nFrame, inputFrameStride*input->size[1],
+ kW*input->size[1], 1);
+
+ THTensor_(setStorage2d)(outputWindow, output->storage,
+ output->storageOffset + k*output->size[1],
+ nFrame, outputFrameStride*output->size[1],
+ output->size[1], 1);
+
+ THTensor *tweight = THTensor_(new)();
+ THTensor_(transpose)(tweight, weight, 0, 1);
+ THTensor_(addmm)(outputWindow, 1, outputWindow, 1, inputWindow, tweight);
+ THTensor_(free)(tweight);
+ }
+ }
+ else
+ {
+ THTensor *outputSample = THTensor_(new)();
+ THTensor *inputSample = THTensor_(new)();
+ int nBatchFrame = input->size[0];
+
+ THTensor_(resize3d)(output,
+ nBatchFrame,
+ nOutputFrame,
+ outputFrameSize);
+
+ for(i = 0; i < nBatchFrame; i++)
+ {
+ THTensor_(select)(outputSample, output, 0, i);
+ THTensor_(select)(inputSample, input, 0, i);
+ long nOutputSampleFrame = nOutputFrame;
+
+ /* bias first */
+ for(k = 0; k < nOutputFrame; k++)
+ {
+ THTensor_(select)(outputWindow, outputSample, 0, k);
+ THTensor_(copy)(outputWindow, bias);
+ }
+
+ /* ouch */
+ for(k = 0; nOutputSampleFrame > 0; k++)
+ {
+ long outputFrameStride = (kW-1)/dW+1;
+ long inputFrameStride = outputFrameStride*dW;
+ long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+ nOutputSampleFrame -= nFrame;
+
+ THTensor_(setStorage2d)(inputWindow, inputSample->storage,
+ inputSample->storageOffset+k*dW*inputSample->size[1],
+ nFrame, inputFrameStride*inputSample->size[1],
+ kW*inputSample->size[1], 1);
+
+ THTensor_(setStorage2d)(outputWindow, outputSample->storage,
+ outputSample->storageOffset + k*outputSample->size[1],
+ nFrame, outputFrameStride*outputSample->size[1],
+ outputSample->size[1], 1);
+
+ THTensor *tweight = THTensor_(new)();
+ THTensor_(transpose)(tweight, weight, 0, 1);
+ THTensor_(addmm)(outputWindow, 1, outputWindow, 1, inputWindow, tweight);
+ THTensor_(free)(tweight);
+ }
+ }
+ THTensor_(free)(outputSample);
+ THTensor_(free)(inputSample);
+ }
+
+ THTensor_(free)(outputWindow);
+ THTensor_(free)(inputWindow);
+ THTensor_(free)(input);
+
+}
+
+void THNN_(TemporalConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ int kW,
+ int dW)
+{
+ long nInputFrame;
+ long nOutputFrame;
+
+ THTensor *gradOutputWindow;
+ THTensor *gradInputWindow;
+ long k, i;
+
+ int dimS = 0; // sequence dimension
+ int dimF = 1; // feature dimension
+
+ if (gradOutput->nDimension == 3)
+ {
+ dimS = 1;
+ dimF = 2;
+ }
+
+ THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+ THNN_(TemporalConvolution_shapeCheck)(
+ state, input, kW, dW, NULL);
+ nInputFrame = input->size[dimS];
+ nOutputFrame = gradOutput->size[dimS];
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ gradOutputWindow = THTensor_(new)();
+ gradInputWindow = THTensor_(new)();
+
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ if (gradOutput->nDimension == 2)
+ {
+ /* ouch */
+ for(k = 0; nOutputFrame > 0; k++)
+ {
+ long outputFrameStride = (kW-1)/dW+1;
+ long inputFrameStride = outputFrameStride*dW;
+ long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+ nOutputFrame -= nFrame;
+
+ THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage,
+ gradOutput->storageOffset + k*gradOutput->size[1],
+ nFrame, outputFrameStride*gradOutput->size[1],
+ gradOutput->size[1], 1);
+
+ THTensor_(setStorage2d)(gradInputWindow, gradInput->storage,
+ gradInput->storageOffset+k*dW*gradInput->size[1],
+ nFrame, inputFrameStride*gradInput->size[1],
+ kW*gradInput->size[1], 1);
+
+ THTensor_(addmm)(gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight);
+ }
+ }
+ else
+ {
+ THTensor *gradOutputSample = THTensor_(new)();
+ THTensor *gradInputSample = THTensor_(new)();
+ int nBatchFrame = input->size[0];
+
+ for(i = 0; i < nBatchFrame; i++)
+ {
+ THTensor_(select)(gradOutputSample, gradOutput, 0, i);
+ THTensor_(select)(gradInputSample, gradInput, 0, i);
+ int nOutputSampleFrame = nOutputFrame;
+
+ /* ouch */
+ for(k = 0; nOutputSampleFrame > 0; k++)
+ {
+ long outputFrameStride = (kW-1)/dW+1;
+ long inputFrameStride = outputFrameStride*dW;
+ long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+ nOutputSampleFrame -= nFrame;
+
+ THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage,
+ gradOutputSample->storageOffset + k*gradOutputSample->size[1],
+ nFrame, outputFrameStride*gradOutputSample->size[1],
+ gradOutputSample->size[1], 1);
+
+ THTensor_(setStorage2d)(gradInputWindow, gradInputSample->storage,
+ gradInputSample->storageOffset+k*dW*gradInputSample->size[1],
+ nFrame, inputFrameStride*gradInputSample->size[1],
+ kW*gradInputSample->size[1], 1);
+
+ THTensor_(addmm)(gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight);
+ }
+ }
+ THTensor_(free)(gradOutputSample);
+ THTensor_(free)(gradInputSample);
+ }
+
+ THTensor_(free)(gradOutputWindow);
+ THTensor_(free)(gradInputWindow);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(input);
+
+}
+
+void THNN_(TemporalConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ int kW,
+ int dW,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ long nInputFrame;
+ long nOutputFrame;
+
+ THTensor *gradOutputWindow;
+ THTensor *inputWindow;
+ long k, i;
+
+ int dimS = 0; // sequence dimension
+ int dimF = 1; // feature dimension
+
+ if (gradOutput->nDimension == 3)
+ {
+ dimS = 1;
+ dimF = 2;
+ }
+
+ THNN_(TemporalConvolution_shapeCheck)(
+ state, input, kW, dW, NULL);
+ nInputFrame = input->size[dimS];
+ nOutputFrame = gradOutput->size[dimS];
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ gradOutputWindow = THTensor_(new)();
+ inputWindow = THTensor_(new)();
+
+ if (input->nDimension == 2)
+ {
+ /* bias first */
+ for(k = 0; k < nOutputFrame; k++)
+ {
+ THTensor_(select)(gradOutputWindow, gradOutput, 0, k);
+ THTensor_(cadd)(gradBias, gradBias, scale, gradOutputWindow);
+ }
+
+ /* ouch */
+ for(k = 0; nOutputFrame > 0; k++)
+ {
+ long outputFrameStride = (kW-1)/dW+1;
+ long inputFrameStride = outputFrameStride*dW;
+ long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+ nOutputFrame -= nFrame;
+
+ THTensor_(setStorage2d)(inputWindow, input->storage,
+ input->storageOffset+k*dW*input->size[1],
+ nFrame, inputFrameStride*input->size[1],
+ kW*input->size[1], 1);
+
+ THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage,
+ gradOutput->storageOffset + k*gradOutput->size[1],
+ nFrame, outputFrameStride*gradOutput->size[1],
+ gradOutput->size[1], 1);
+
+ THTensor *tgradOutputWindow = THTensor_(new)();
+ THTensor_(transpose)(tgradOutputWindow, gradOutputWindow, 0, 1);
+ THTensor_(addmm)(gradWeight, 1, gradWeight, scale, tgradOutputWindow, inputWindow);
+ THTensor_(free)(tgradOutputWindow);
+ }
+ }
+ else
+ {
+ THTensor *gradOutputSample = THTensor_(new)();
+ THTensor *inputSample = THTensor_(new)();
+ int nBatchFrame = input->size[0];
+
+ for(i = 0; i < nBatchFrame; i++)
+ {
+ THTensor_(select)(gradOutputSample, gradOutput, 0, i);
+ THTensor_(select)(inputSample, input, 0, i);
+ int nOutputSampleFrame = nOutputFrame;
+
+ /* bias first */
+ for(k = 0; k < nOutputFrame; k++)
+ {
+ THTensor_(select)(gradOutputWindow, gradOutputSample, 0, k);
+ THTensor_(cadd)(gradBias, gradBias, scale, gradOutputWindow);
+ }
+
+ /* ouch */
+ for(k = 0; nOutputSampleFrame > 0; k++)
+ {
+ long outputFrameStride = (kW-1)/dW+1;
+ long inputFrameStride = outputFrameStride*dW;
+ long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1;
+ nOutputSampleFrame -= nFrame;
+
+ THTensor_(setStorage2d)(inputWindow, inputSample->storage,
+ inputSample->storageOffset+k*dW*inputSample->size[1],
+ nFrame, inputFrameStride*inputSample->size[1],
+ kW*inputSample->size[1], 1);
+
+ THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage,
+ gradOutputSample->storageOffset + k*gradOutputSample->size[1],
+ nFrame, outputFrameStride*gradOutputSample->size[1],
+ gradOutputSample->size[1], 1);
+
+ THTensor *tgradOutputWindow = THTensor_(new)();
+ THTensor_(transpose)(tgradOutputWindow, gradOutputWindow, 0, 1);
+ THTensor_(addmm)(gradWeight, 1, gradWeight, scale, tgradOutputWindow, inputWindow);
+ THTensor_(free)(tgradOutputWindow);
+ }
+ }
+ THTensor_(free)(gradOutputSample);
+ THTensor_(free)(inputSample);
+ }
+
+ THTensor_(free)(gradOutputWindow);
+ THTensor_(free)(inputWindow);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(input);
+
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/TemporalMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/TemporalMaxPooling.c
new file mode 100644
index 000000000..344c1b3fd
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/TemporalMaxPooling.c
@@ -0,0 +1,283 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/TemporalMaxPooling.c"
+#else
+
+static inline void THNN_(TemporalMaxPooling_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THIndexTensor *indices,
+ int kW,
+ int dW) {
+ long niframe;
+ long framesize;
+ long noframe;
+
+ int dimS = 0; // sequence dimension
+ int dimF = 1; // feature dimension
+ int ndims = input->nDimension;
+
+ if (input->nDimension == 3)
+ {
+ dimS = 1;
+ dimF = 2;
+ }
+
+ niframe = input->size[dimS];
+ framesize = input->size[dimF];
+ noframe = (niframe - kW) / dW + 1;
+
+ THArgCheck(kW > 0, 5,
+ "kernel size should be greater than zero, but got kW: %d", kW);
+ THArgCheck(dW > 0, 6,
+ "stride should be greater than zero, but got dW: %d", dW);
+
+ THNN_ARGCHECK(input->nDimension == 2 || input->nDimension == 3, 2, input,
+ "2D or 3D (batch mode) tensor expected for input, but got: %s");
+ THArgCheck(input->size[dimS] >= kW, 2,
+ "input sequence smaller than kernel size. Got: %d, Expected: %d",
+ input->size[dimS], kW);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndims, dimS, noframe);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndims, dimF, framesize);
+ }
+ if (indices != NULL) {
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndims, dimS, noframe);
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndims, dimF, framesize);
+ }
+}
+
+void THNN_(TemporalMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kW,
+ int dW)
+{
+ long niframe;
+ long framesize;
+ long noframe;
+
+ real *input_data;
+ real *output_data;
+ THIndex_t *indices_data;
+
+ long t, y;
+
+ int dimS = 0; // sequence dimension
+ int dimF = 1; // feature dimension
+
+ THNN_(TemporalMaxPooling_shapeCheck)(state, input, NULL, NULL, kW, dW);
+
+ if (input->nDimension == 3)
+ {
+ dimS = 1;
+ dimF = 2;
+ }
+
+ /* sizes */
+ niframe = input->size[dimS];
+ framesize = input->size[dimF];
+ noframe = (niframe - kW) / dW + 1;
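+ /* one pooled frame per window of kW input frames, stepped by dW */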
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ if (input->nDimension == 2)
+ {
+ /* resize output */
+ THTensor_(resize2d)(output, noframe, framesize);
+
+ /* indices will contain index locations for each output point */
+ THIndexTensor_(resize2d)(indices, noframe, framesize);
+
+ /* get raw pointers */
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ for(t = 0; t < noframe; t++)
+ {
+ real *ip = input_data + t*framesize*dW;
+ real *op = output_data + t*framesize;
+ THIndex_t *xp = indices_data + t*framesize;
+#pragma omp parallel for private(y)
+ for(y = 0; y < framesize; y++)
+ {
+ /* compute local max: */
+ long maxindex = -1;
+ real maxval = -THInf;
+ long x;
+ for(x = 0; x < kW; x++)
+ {
+ real val = ip[x*framesize+y];
+ if (val > maxval)
+ {
+ maxval = val;
+ maxindex = x;
+ }
+ }
+
+ /* set output to local max */
+ op[y] = maxval;
+ xp[y] = (THIndex_t)maxindex;
+ }
+ }
+ }
+ else
+ {
+ /* number of batch frames */
+ long nbframe = input->size[0];
+ long i;
+
+ /* resize output */
+ THTensor_(resize3d)(output, nbframe, noframe, framesize);
+
+ /* indices will contain index locations for each output point */
+ THIndexTensor_(resize3d)(indices, nbframe, noframe, framesize);
+
+ /* get raw pointers */
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ for(i = 0; i < nbframe; i++)
+ {
+ real *inputSample_data = input_data + i*niframe*framesize;
+ real *outputSample_data = output_data + i*noframe*framesize;
+ THIndex_t *indicesSample_data = indices_data + i*noframe*framesize;
+
+ for(t = 0; t < noframe; t++)
+ {
+ real *ip = inputSample_data + t*framesize*dW;
+ real *op = outputSample_data + t*framesize;
+ THIndex_t *xp = indicesSample_data + t*framesize;
+
+#pragma omp parallel for private(y)
+ for(y = 0; y < framesize; y++)
+ {
+ /* compute local max: */
+ long maxindex = -1;
+ real maxval = -THInf;
+ long x;
+ for(x = 0; x < kW; x++)
+ {
+ real val = ip[x*framesize+y];
+ if (val > maxval)
+ {
+ maxval = val;
+ maxindex = x;
+ }
+ }
+
+ /* set output to local max */
+ op[y] = maxval;
+ xp[y] = (THIndex_t)maxindex;
+ }
+ }
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+
+}
+
+void THNN_(TemporalMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kW,
+ int dW)
+{
+ long niframe;
+ long noframe;
+ long framesize;
+
+ real *gradInput_data;
+ real *gradOutput_data;
+ THIndex_t *indices_data;
+
+ long t, y;
+
+ THNN_(TemporalMaxPooling_shapeCheck)(state, input, gradOutput, indices, kW, dW);
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize and zero */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ int dimS = 0; // sequence dimension
+ int dimF = 1; // feature dimension
+
+ if (input->nDimension == 3)
+ {
+ dimS = 1;
+ dimF = 2;
+ }
+ /* sizes */
+ niframe = input->size[dimS];
+ noframe = gradOutput->size[dimS];
+ framesize = gradOutput->size[dimF];
+
+ /* get raw pointers */
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+ indices_data = THIndexTensor_(data)(indices);
+
+ if (input->nDimension == 2)
+ {
+ for(t = 0; t < noframe; t++)
+ {
+ real *gip = gradInput_data + t*framesize*dW;
+ real *gop = gradOutput_data + t*framesize;
+ THIndex_t *xp = indices_data + t*framesize;
+#pragma omp parallel for private(y)
+ for(y = 0; y < framesize; y++)
+ {
+ /* compute local max: */
+ long maxindex = (long)xp[y];
+ if (maxindex != -1)
+ gip[maxindex*framesize+y] += gop[y];
+ }
+ }
+ }
+ else
+ {
+ /* number of batch frames */
+ long nbframe = input->size[0];
+ long i;
+
+ for(i = 0; i < nbframe; i++)
+ {
+ real *gradInputSample_data = gradInput_data + i*niframe*framesize;
+ real *gradOutputSample_data = gradOutput_data + i*noframe*framesize;
+ THIndex_t *indicesSample_data = indices_data + i*noframe*framesize;
+
+ for(t = 0; t < noframe; t++)
+ {
+ real *gip = gradInputSample_data + t*framesize*dW;
+ real *gop = gradOutputSample_data + t*framesize;
+ THIndex_t *xp = indicesSample_data + t*framesize;
+#pragma omp parallel for private(y)
+ for(y = 0; y < framesize; y++)
+ {
+ /* compute local max: */
+ long maxindex = (long)xp[y];
+ if (maxindex != -1)
+ gip[maxindex*framesize+y] += gop[y];
+ }
+ }
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/TemporalRowConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/TemporalRowConvolution.c
new file mode 100644
index 000000000..e3ae41e22
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/TemporalRowConvolution.c
@@ -0,0 +1,472 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/TemporalRowConvolution.c"
+#else
+
+static inline void THNN_(TemporalRowConvolution_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *weight,
+ THTensor *bias,
+ int kW,
+ int dW,
+ int padW) {
+
+ THArgCheck(kW > 0, 5,
+ "kernel size should be greater than zero, but got kW: %d", kW);
+ THArgCheck(dW > 0, 6,
+ "stride should be greater than zero, but got dW: %d", dW);
+ THNN_ARGCHECK(weight->nDimension == 3, 3, weight,
+ "3D weight tensor expected, but got: %s");
+ THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+ THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous");
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]);
+ }
+
+ // we're always looking at (possibly batch) x feats x seq
+ int ndim = input->nDimension;
+ int dimF = 0;
+ int dimS = 1;
+
+ if (ndim == 3) {
+ ++dimS;
+ ++dimF;
+ }
+
+ THNN_ARGCHECK(ndim == 2 || ndim == 3, 1, input,
+ "2D or 3D (batch mode) input tensor expected, but got :%s");
+
+ long inputFrameSize = weight->size[0];
+ long nInputFrame = input->size[dimS];
+ long nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1;
+
+ if (nOutputFrame < 1) {
+ THError("Given input size: (%d x %d). "
+ "Calculated output size: (%d x %d). Output size is too small",
+ inputFrameSize, nInputFrame, inputFrameSize, nOutputFrame);
+ }
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimF, inputFrameSize);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimF, inputFrameSize);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimS, nOutputFrame);
+ }
+}
+
+static void THNN_(unfolded_acc_row)(
+ THTensor *finput,
+ THTensor *input,
+ int kW,
+ int dW,
+ int padW,
+ long inputFrameSize,
+ long nInputFrame,
+ long nOutputFrame) {
+
+ size_t c;
+ real *input_data = THTensor_(data)(input);
+ real *finput_data = THTensor_(data)(finput);
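+ /* finput layout: [inputFrameSize][kW][nOutputFrame]; note that padW is unused in this unfold */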
+
+// #pragma omp parallel for private(c)
+ for (c = 0; c < inputFrameSize; c++) {
+ size_t kw, x;
+ long long ix = 0;
+
+ for (kw = 0; kw < kW; kw++) {
+ real *src = finput_data
+ + c * (kW * nOutputFrame)
+ + kw * (nOutputFrame);
+ real *dst = input_data + c * (nInputFrame);
+
+ ix = (long long)(kw);
+ if (dW == 1) {
+ real *dst_slice = dst + (size_t)(ix);
+ THVector_(cadd)(dst_slice, dst_slice, src, 1, nOutputFrame);
+ } else {
+ for (x = 0; x < nOutputFrame; x++) {
+ real *dst_slice = dst + (size_t)(ix + x * dW);
+ THVector_(cadd)(dst_slice, dst_slice,
+ src + (size_t)(x), 1, 1);
+ }
+ }
+ }
+ }
+}
+
+static void THNN_(unfolded_copy_row)(
+ THTensor *finput,
+ THTensor *input,
+ int kW,
+ int dW,
+ int padW,
+ long inputFrameSize,
+ long nInputFrame,
+ long nOutputFrame) {
+
+ long k;
+ real *input_data = THTensor_(data)(input);
+ real *finput_data = THTensor_(data)(finput);
+
+// #pragma omp parallel for private(k)
+ for (k = 0; k < inputFrameSize * kW; k++) {
+ size_t c = k / kW;
+ size_t kw = k % kW;
+ size_t x;
+ long long ix;
+ real *dst = finput_data + c * (kW * nOutputFrame) + kw * (nOutputFrame);
+ real *src = input_data + c * (nInputFrame);
+
+ ix = (long long)(kw);
+ if (dW == 1) {
+ memcpy(dst, src+(size_t)(ix), sizeof(real) * (nOutputFrame));
+ } else {
+ for (x = 0; x < nOutputFrame; x++) {
+ memcpy(dst + (size_t)(x), src + (size_t)(ix + x * dW),
+ sizeof(real) * 1);
+ }
+ }
+ }
+}
+
+static void THNN_(TemporalRowConvolution_updateOutput_frame)(
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ int kW,
+ int dW,
+ int padW,
+ long inputFrameSize,
+ long nInputFrame,
+ long nOutputFrame) {
+
+ long i;
+
+ THTensor *output3d = THTensor_(newWithStorage3d)(
+ output->storage, output->storageOffset,
+ inputFrameSize, -1,
+ 1, -1,
+ nOutputFrame, -1);
+
+ THNN_(unfolded_copy_row)(finput, input, kW, dW, padW,
+ inputFrameSize, nInputFrame, nOutputFrame);
+
+ THTensor_(zero)(output);
+
+ if (bias != NULL) {
+ for (i = 0; i < inputFrameSize; i++)
+ THVector_(fill)
+ (output->storage->data + output->storageOffset
+ + output->stride[0] * i,
+ THTensor_(get1d)(bias, i), nOutputFrame);
+ }
+
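+ // weight: inputFrameSize x 1 x kW
+ // finput: inputFrameSize x kW x nOutputFrame
+ // output3d: inputFrameSize x 1 x nOutputFrame (batched vector-matrix product per feature)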
+ THTensor_(baddbmm)(output3d, 1, output3d, 1, weight, finput);
+
+ THTensor_(free)(output3d);
+}
+
+void THNN_(TemporalRowConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ THTensor *fgradInput, // unused here but needed for CUDA
+ int kW,
+ int dW,
+ int padW,
+ bool featFirst) {
+
+ int ndim = input->nDimension;
+
+ THTensor *tinput;
+ if (!featFirst) {
+ tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2);
+ input = THTensor_(newContiguous)(tinput);
+ } else {
+ input = THTensor_(newContiguous)(input);
+ }
+
+ THNN_(TemporalRowConvolution_shapeCheck)(
+ state, input, NULL, weight, bias, kW, dW, padW);
+
+ long inputFrameSize = weight->size[0];
+ long nInputFrame = input->size[ndim - 1];
+ long nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1;
+
+ if (ndim == 2) { /* non-batch mode */
+
+ THTensor_(resize3d)(finput, inputFrameSize, kW, nOutputFrame);
+ THTensor_(resize2d)(output, inputFrameSize, nOutputFrame);
+
+ THTensor_(zero)(finput);
+ THTensor_(zero)(output);
+
+ THNN_(TemporalRowConvolution_updateOutput_frame)
+ (input, output, weight, bias, finput,
+ kW, dW, padW,
+ inputFrameSize, nInputFrame, nOutputFrame);
+
+ } else {
+ long T = input->size[0];
+ long t;
+
+ THTensor_(resize4d)(finput, T, inputFrameSize, kW, nOutputFrame);
+ THTensor_(resize3d)(output, T, inputFrameSize, nOutputFrame);
+
+ THTensor_(zero)(finput);
+ THTensor_(zero)(output);
+
+#pragma omp parallel for private(t)
+ for (t = 0; t < T; t++) {
+ THTensor *input_t = THTensor_(newSelect)(input, 0, t);
+ THTensor *output_t = THTensor_(newSelect)(output, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ THNN_(TemporalRowConvolution_updateOutput_frame)
+ (input_t, output_t, weight, bias, finput_t,
+ kW, dW, padW, inputFrameSize, nInputFrame, nOutputFrame);
+
+ THTensor_(free)(input_t);
+ THTensor_(free)(output_t);
+ THTensor_(free)(finput_t);
+ }
+ }
+
+ if (!featFirst) { // NOTE: output will NOT be contiguous in this case
+ THTensor_(transpose)(output, output, ndim - 1, ndim - 2);
+ THTensor_(free)(tinput);
+ }
+
+ THTensor_(free)(input);
+}
+
+static void THNN_(TemporalRowConvolution_updateGradInput_frame)(
+ THTensor *gradInput,
+ THTensor *gradOutput,
+ THTensor *weight,
+ THTensor *fgradInput,
+ int kW,
+ int dW,
+ int padW,
+ long inputFrameSize,
+ long nInputFrame,
+ long nOutputFrame) {
+
+ THTensor *gradOutput3d = THTensor_(newWithStorage3d)(
+ gradOutput->storage, gradOutput->storageOffset,
+ inputFrameSize, -1,
+ 1, -1,
+ nOutputFrame, -1);
+
+ // weight: inputFrameSize x kW x 1
+ // gradOutput3d: inputFrameSize x 1 x nOutputFrame
+ THTensor_(baddbmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput3d);
+ // fgradInput: inputFrameSize x kW x nOutputFrame
+ THTensor_(free)(gradOutput3d);
+
+ THTensor_(zero)(gradInput);
+
+ THNN_(unfolded_acc_row)(fgradInput, gradInput,
+ kW, dW, padW,
+ inputFrameSize, nInputFrame, nOutputFrame);
+}
+
+void THNN_(TemporalRowConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int dW,
+ int padW,
+ bool featFirst) {
+
+ int ndim = input->nDimension;
+
+ THTensor *tinput, *tgradOutput;
+
+ if (!featFirst) {
+ tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2);
+ tgradOutput = THTensor_(newTranspose)(gradOutput, ndim - 1, ndim - 2);
+
+ input = THTensor_(newContiguous)(tinput);
+ gradOutput = THTensor_(newContiguous)(tgradOutput);
+
+ } else {
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ }
+
+ THNN_(TemporalRowConvolution_shapeCheck)(state, input, gradOutput, weight,
+ NULL, kW, dW, padW);
+
+ long inputFrameSize = weight->size[0];
+ long nInputFrame = input->size[ndim - 1];
+ long nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1;
+
+ THTensor_(resizeAs)(fgradInput, finput);
+ THTensor_(resizeAs)(gradInput, input);
+
+ THTensor_(zero)(fgradInput);
+ THTensor_(zero)(gradInput);
+
+ THTensor *tweight = THTensor_(new)();
+ THTensor_(transpose)(tweight, weight, 1, 2);
+
+ if (ndim == 2) {
+ THNN_(TemporalRowConvolution_updateGradInput_frame)
+ (gradInput, gradOutput, tweight, fgradInput,
+ kW, dW, padW,
+ inputFrameSize, nInputFrame, nOutputFrame);
+ } else {
+ long T = input->size[0];
+ long t;
+
+#pragma omp parallel for private(t)
+ for (t = 0; t < T; t++) {
+
+ THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
+
+ THNN_(TemporalRowConvolution_updateGradInput_frame)
+ (gradInput_t, gradOutput_t, tweight, fgradInput_t,
+ kW, dW, padW,
+ inputFrameSize, nInputFrame, nOutputFrame);
+
+ THTensor_(free)(gradInput_t);
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(fgradInput_t);
+ }
+ }
+
+ THTensor_(free)(tweight);
+
+ if (!featFirst) { // NOTE: gradInput will NOT be contiguous in this case
+
+ THTensor_(free)(tinput);
+ THTensor_(free)(tgradOutput);
+
+ THTensor_(transpose)(gradInput, gradInput, ndim - 1, ndim - 2);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+
+}
+
+static void THNN_(TemporalRowConvolution_accGradParameters_frame)(
+ THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias,
+ THTensor *finput, real scale) {
+
+ long i;
+ THTensor *gradOutput3d = THTensor_(newWithStorage3d)(
+ gradOutput->storage, gradOutput->storageOffset,
+ gradOutput->size[0], -1,
+ 1, -1,
+ gradOutput->size[1], -1);
+
+ THTensor *tfinput = THTensor_(new)();
+ THTensor_(transpose)(tfinput, finput, 1, 2);
+ // gradOutput3d: inputFrameSize x 1 x nOutputFrame
+ // tfinput (finput transposed): inputFrameSize x nOutputFrame x kW
+ THTensor_(baddbmm)(gradWeight, 1, gradWeight, scale, gradOutput3d, tfinput);
+ // gradWeight: inputFrameSize x 1 x kW
+ THTensor_(free)(tfinput);
+
+ if (gradBias != NULL) {
+ for (i = 0; i < gradBias->size[0]; i++) {
+ long k;
+ real sum = 0;
+ real *data = gradOutput3d->storage->data
+ + gradOutput3d->storageOffset
+ + i * gradOutput3d->stride[0];
+ for (k = 0; k < gradOutput3d->size[2]; k++) {
+ sum += data[k];
+ }
+ (gradBias->storage->data + gradBias->storageOffset)[i]
+ += scale * sum;
+ }
+ }
+
+ THTensor_(free)(gradOutput3d);
+
+}
+
+void THNN_(TemporalRowConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kW,
+ int dW,
+ int padW,
+ bool featFirst,
+ accreal scale_) {
+
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ int ndim = input->nDimension;
+
+ THTensor *tinput, *tgradOutput;
+
+ if (!featFirst) {
+ tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2);
+ tgradOutput = THTensor_(newTranspose)(gradOutput, ndim - 1, ndim - 2);
+
+ input = THTensor_(newContiguous)(tinput);
+ gradOutput = THTensor_(newContiguous)(tgradOutput);
+ } else {
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ }
+
+ THNN_(TemporalRowConvolution_shapeCheck)
+ (state, input, gradOutput, gradWeight, gradBias, kW, dW, padW);
+
+ long inputFrameSize = gradWeight->size[0];
+ long nInputFrame = input->size[ndim - 1];
+ long nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1;
+
+ if (ndim == 2) {
+ THNN_(TemporalRowConvolution_accGradParameters_frame)(
+ gradOutput, gradWeight, gradBias, finput, scale);
+ } else {
+ long T = input->size[0];
+ long t;
+
+ for (t = 0; t < T; t++) {
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ THNN_(TemporalRowConvolution_accGradParameters_frame)(
+ gradOutput_t, gradWeight, gradBias, finput_t, scale);
+
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(finput_t);
+ }
+ }
+
+ if (!featFirst) {
+ THTensor_(free)(tinput);
+ THTensor_(free)(tgradOutput);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/TemporalSubSampling.c b/contrib/lua-torch/nn/lib/THNN/generic/TemporalSubSampling.c
new file mode 100644
index 000000000..68f35e28a
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/TemporalSubSampling.c
@@ -0,0 +1,156 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/TemporalSubSampling.c"
+#else
+
+static inline void THNN_(TemporalSubSampling_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ int kW,
+ int dW,
+ int *inputFrameSize) {
+ int nInputFrame, nOutputFrame;
+
+ THArgCheck(kW > 0, 6,
+ "kernel size should be greater than zero, but got kW: %d", kW);
+ THArgCheck(dW > 0, 7,
+ "stride should be greater than zero, but got dW: %d", dW);
+
+ THNN_ARGCHECK(input->nDimension == 2, 2, input,
+ "2D or 3D (batch mode) tensor expected for input, but got: %s");
+ if (inputFrameSize != NULL) {
+ THArgCheck( input->size[1] == *inputFrameSize, 2,
+ "invalid input frame size. Got: %d, Expected: %d",
+ input->size[1], *inputFrameSize);
+ }
+ THArgCheck( input->size[0] >= kW, 2,
+ "input sequence smaller than kernel size. Got %d, Expected: %d",
+ input->size[0], kW);
+
+ nInputFrame = input->size[0];
+ nOutputFrame = (nInputFrame - kW) / dW + 1;
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, input->nDimension, 0, nOutputFrame);
+ if (inputFrameSize != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, input->nDimension, 1, *inputFrameSize);
+ }
+ }
+}
+
+void THNN_(TemporalSubSampling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ int kW,
+ int dW,
+ int inputFrameSize)
+{
+ THTensor *outputFrame, *inputWindow;
+ int nInputFrame, nOutputFrame;
+ long k;
+
+ THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+ THArgCheck(!bias || THTensor_(isContiguous)(bias), 4, "bias must be contiguous");
+ THNN_(TemporalSubSampling_shapeCheck)(state, input, NULL, kW, dW, &inputFrameSize);
+
+ outputFrame = THTensor_(new)();
+ inputWindow = THTensor_(new)();
+
+ nInputFrame = input->size[0];
+ nOutputFrame = (nInputFrame - kW) / dW + 1;
+
+ THTensor_(resize2d)(output,
+ nOutputFrame,
+ inputFrameSize);
+
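+ /* for each output frame k: output[k] = weight .* sum(input[k*dW .. k*dW+kW-1]) + bias */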
+ for(k = 0; k < nOutputFrame; k++)
+ {
+ THTensor_(narrow)(inputWindow, input, 0, k*dW, kW);
+ THTensor_(select)(outputFrame, output, 0, k);
+ THTensor_(sum)(outputFrame, inputWindow, 0, 1);
+ THTensor_(cmul)(outputFrame, outputFrame, weight);
+ THTensor_(cadd)(outputFrame, outputFrame, 1, bias);
+ }
+
+ THTensor_(free)(outputFrame);
+ THTensor_(free)(inputWindow);
+}
+
+void THNN_(TemporalSubSampling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ int kW,
+ int dW)
+{
+
+ THTensor *gradOutputFrame;
+ THTensor *gradInputWindow, *buffer, *kwunit;
+ long k;
+
+ THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+ THNN_(TemporalSubSampling_shapeCheck)(state, input, gradOutput, kW, dW, NULL);
+
+ gradOutputFrame = THTensor_(new)();
+ gradInputWindow = THTensor_(new)();
+ buffer = THTensor_(new)();
+ kwunit = THTensor_(newWithSize1d)(kW);
+
+ THTensor_(fill)(kwunit, 1);
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ for(k = 0; k < gradOutput->size[0]; k++)
+ {
+ THTensor_(narrow)(gradInputWindow, gradInput, 0, k*dW, kW);
+ THTensor_(select)(gradOutputFrame, gradOutput, 0, k);
+ THTensor_(cmul)(buffer, weight, gradOutputFrame);
+ THTensor_(addr)(gradInputWindow, 1, gradInputWindow, 1, kwunit, buffer);
+ }
+
+ THTensor_(free)(gradOutputFrame);
+ THTensor_(free)(gradInputWindow);
+ THTensor_(free)(buffer);
+ THTensor_(free)(kwunit);
+}
+
+void THNN_(TemporalSubSampling_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ int kW,
+ int dW,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THTensor *gradOutputFrame;
+ THTensor *inputWindow, *buffer;
+ long k;
+
+ THNN_(TemporalSubSampling_shapeCheck)(state, input, gradOutput, kW, dW, NULL);
+ gradOutputFrame = THTensor_(new)();
+ inputWindow = THTensor_(new)();
+ buffer = THTensor_(new)();
+
+ for(k = 0; k < gradOutput->size[0]; k++)
+ {
+ THTensor_(narrow)(inputWindow, input, 0, k*dW, kW);
+ THTensor_(select)(gradOutputFrame, gradOutput, 0, k);
+ THTensor_(sum)(buffer, inputWindow, 0, 1);
+ THTensor_(addcmul)(gradWeight, gradWeight, scale, buffer, gradOutputFrame);
+ THTensor_(cadd)(gradBias, gradBias, scale, gradOutputFrame);
+ }
+
+ THTensor_(free)(gradOutputFrame);
+ THTensor_(free)(inputWindow);
+ THTensor_(free)(buffer);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Threshold.c b/contrib/lua-torch/nn/lib/THNN/generic/Threshold.c
new file mode 100644
index 000000000..949c7a07c
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/Threshold.c
@@ -0,0 +1,64 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/Threshold.c"
+#else
+
+void THNN_(Threshold_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ accreal threshold_,
+ accreal val_,
+ bool inplace)
+{
+ real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_);
+ real val = TH_CONVERT_ACCREAL_TO_REAL(val_);
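+ /* note: ReLU is the special case threshold = 0, val = 0 */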
+ if (inplace)
+ {
+ TH_TENSOR_APPLY(real, input,
+ if (*input_data <= threshold)
+ *input_data = val;
+ );
+ THTensor_(set)(output, input);
+ }
+ else
+ {
+ THTensor_(resizeAs)(output, input);
+ TH_TENSOR_APPLY2(real, output, real, input,
+ *output_data = (*input_data > threshold) ? *input_data : val;
+ );
+ }
+}
+
+void THNN_(Threshold_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ accreal threshold_,
+ accreal val_,
+ bool inplace)
+{
+ real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_);
+ real val = TH_CONVERT_ACCREAL_TO_REAL(val_);
+ THNN_CHECK_NELEMENT(input, gradOutput);
+ if (inplace)
+ {
+ TH_TENSOR_APPLY2(real, gradOutput, real, input,
+ if ((*input_data) <= threshold)
+ *gradOutput_data = 0;
+ );
+ THTensor_(set)(gradInput, gradOutput);
+ }
+ else
+ {
+ THTensor_(resizeAs)(gradInput, input);
+ TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input,
+ if ((*input_data) > threshold)
+ *gradInput_data = *gradOutput_data;
+ else
+ *gradInput_data = 0;
+ );
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricAveragePooling.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricAveragePooling.c
new file mode 100644
index 000000000..91c870e6f
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricAveragePooling.c
@@ -0,0 +1,373 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricAveragePooling.c"
+#else
+
+static inline void THNN_(VolumetricAveragePooling_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH) {
+ long nslices;
+ long itime;
+ long iheight;
+ long iwidth;
+ long otime;
+ long oheight;
+ long owidth;
+ int ndim = input->nDimension;
+ int dimN = 0;
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ if (input->nDimension == 5)
+ {
+ dimN++;
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ THArgCheck(kT > 0 && kW > 0 && kH > 0, 5,
+ "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d",
+ kT, kH, kW);
+ THArgCheck(dT > 0 && dW > 0 && dH > 0, 8,
+ "stride should be greater than zero, but got dT: %d dH: %d dW: %d",
+ dT, dH, dW);
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+
+ THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH
+ && input->size[dimt] >= kT, 2,
+ "input image (T: %d H: %d W: %d) smaller than "
+ "kernel size (kT: %d kH: %d kW: %d)",
+ input->size[dimt], input->size[dimh], input->size[dimw],
+ kT, kH, kW);
+
+ /* sizes */
+ nslices = input->size[dimN];
+ itime = input->size[dimt];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ otime = (itime - kT) / dT + 1;
+ oheight = (iheight - kH) / dH + 1;
+ owidth = (iwidth - kW) / dW + 1;
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimN, nslices);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, otime);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, oheight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, owidth);
+ }
+}
+
+static void THNN_(VolumetricAveragePooling_updateOutput_frame)(
+ real *input_p,
+ real *output_p,
+ long nslices,
+ long itime,
+ long iwidth,
+ long iheight,
+ long otime,
+ long owidth,
+ long oheight,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ /* loop over output */
+ long i, j, ti;
+ for (ti = 0; ti < otime; ti++)
+ {
+ for (i = 0; i < oheight; i++)
+ {
+ for (j = 0; j < owidth; j++)
+ {
+ /* local pointers */
+ real *ip = input_p + k * itime * iwidth * iheight
+ + ti * iwidth * iheight * dT + i * iwidth * dH + j * dW;
+ real *op = output_p + k * otime * owidth * oheight
+ + ti * owidth * oheight + i * owidth + j;
+
+ /* compute local sum: */
+ real sum = 0.0;
+ int x, y, z;
+
+ for (z=0; z < kT; z++)
+ {
+ for (y = 0; y < kH; y++)
+ {
+ for (x = 0; x < kW; x++)
+ {
+ sum += *(ip + z * iwidth * iheight + y * iwidth + x);
+ }
+ }
+ }
+
+ /* set output to local average */
+ *op = sum / (kT * kW * kH);
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricAveragePooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH)
+{
+ long nslices;
+ long itime;
+ long iheight;
+ long iwidth;
+ long otime;
+ long oheight;
+ long owidth;
+ real *input_data;
+ real *output_data;
+
+ THNN_(VolumetricAveragePooling_shapeCheck)(
+ state, input, NULL, kT, kW, kH,
+ dT, dW, dH);
+
+ int dimN = 0;
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ if (input->nDimension == 5)
+ {
+ dimN++;
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimN];
+ itime = input->size[dimt];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ otime = (itime - kT) / dT + 1;
+ oheight = (iheight - kH) / dH + 1;
+ owidth = (iwidth - kW) / dW + 1;
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ if (input->nDimension == 4) /* non-batch mode */
+ {
+ /* resize output */
+ THTensor_(resize4d)(output, nslices, otime, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+ THNN_(VolumetricAveragePooling_updateOutput_frame)(
+ input_data, output_data, nslices,
+ itime, iwidth, iheight,
+ otime, owidth, oheight,
+ kT, kW, kH,
+ dT, dW, dH
+ );
+ }
+ else /* batch mode */
+ {
+ long p;
+ long nBatch = input->size[0];
+
+ long istride = nslices * itime * iwidth * iheight;
+ long ostride = nslices * otime * owidth * oheight;
+
+ /* resize output */
+ THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(p)
+ for (p=0; p < nBatch; p++)
+ {
+ THNN_(VolumetricAveragePooling_updateOutput_frame)(
+ input_data + p * istride, output_data + p * ostride, nslices,
+ itime, iwidth, iheight,
+ otime, owidth, oheight,
+ kT, kW, kH,
+ dT, dW, dH
+ );
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+}
+
+static void THNN_(VolumetricAveragePooling_updateGradInput_frame)(
+ real *gradInput_p,
+ real *gradOutput_p,
+ long nslices,
+ long itime,
+ long iwidth,
+ long iheight,
+ long otime,
+ long owidth,
+ long oheight,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ /* loop over output */
+ long i, j, ti;
+ for (ti = 0; ti < otime; ti++)
+ {
+ for (i = 0; i < oheight; i++)
+ {
+ for (j = 0; j < owidth; j++)
+ {
+ /* local pointers */
+ real *ip = gradInput_p + k * itime * iwidth * iheight
+ + ti * iwidth * iheight * dT + i * iwidth * dH + j * dW;
+ real *op = gradOutput_p + k * otime * owidth * oheight
+ + ti * owidth * oheight + i * owidth + j;
+
+ /* scatter gradients out to footprint: */
+ real val = *op / (kT * kW * kH);
+ int x,y,z;
+ for (z=0; z < kT; z++)
+ {
+ for (y = 0; y < kH; y++)
+ {
+ for (x = 0; x < kW; x++)
+ {
+ *(ip + z * iwidth * iheight + y * iwidth + x) += val;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricAveragePooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH)
+{
+ long nslices;
+ long itime;
+ long iheight;
+ long iwidth;
+ long otime;
+ long oheight;
+ long owidth;
+ real *gradInput_data;
+ real *gradOutput_data;
+
+ int dimN = 0;
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ THNN_(VolumetricAveragePooling_shapeCheck)(
+ state, input, gradOutput, kT, kW, kH,
+ dT, dW, dH);
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ if (input->nDimension == 5)
+ {
+ dimN++;
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimN];
+ itime = input->size[dimt];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ otime = gradOutput->size[dimt];
+ oheight = gradOutput->size[dimh];
+ owidth = gradOutput->size[dimw];
+
+ /* get raw pointers */
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+
+ /* backprop */
+ if (input->nDimension == 4) /* non-batch mode*/
+ {
+ THNN_(VolumetricAveragePooling_updateGradInput_frame)(
+ gradInput_data, gradOutput_data, nslices,
+ itime, iwidth, iheight,
+ otime, owidth, oheight,
+ kT, kW, kH,
+ dT, dW, dH
+ );
+ }
+ else /* batch mode */
+ {
+ long p;
+ long nBatch = input->size[0];
+
+ long istride = nslices * itime * iwidth * iheight;
+ long ostride = nslices * otime * owidth * oheight;
+
+#pragma omp parallel for private(p)
+ for (p = 0; p < nBatch; p++)
+ {
+ THNN_(VolumetricAveragePooling_updateGradInput_frame)(
+ gradInput_data + p * istride, gradOutput_data + p * ostride, nslices,
+ itime, iwidth, iheight,
+ otime, owidth, oheight,
+ kT, kW, kH,
+ dT, dW, dH
+ );
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolution.c
new file mode 100644
index 000000000..be1aa82e6
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolution.c
@@ -0,0 +1,260 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricConvolution.c"
+#else
+
+void THNN_(VolumetricConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput, // only used by cuda impl
+ THTensor *fgradInput, // only used by cuda impl
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ THArgCheck(pT == 0 && pW == 0 && pH == 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version
+
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ if (input->nDimension == 5)
+ {
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ long nOutputPlane = weight->size[0];
+ long kT = weight->size[2];
+ long kH = weight->size[3];
+ long kW = weight->size[4];
+ long inputDepth = input->size[dimt];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long outputDepth = (inputDepth - kT) / dT + 1;
+ long outputWidth = (inputWidth - kW) / dW + 1;
+ long outputHeight = (inputHeight - kH) / dH + 1;
+ THTensor *outn = THTensor_(new)();
+ long i, j;
+ if (input->nDimension == 4) /* non-batch mode */
+ {
+ THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
+
+ /* add bias */
+ if (bias) {
+ for (i = 0; i < bias->size[0]; i++)
+ {
+ THTensor_(select)(outn, output, 0, i);
+ THTensor_(fill)(outn, THTensor_(get1d)(bias, i));
+ }
+ } else {
+ THTensor_(zero)(output);
+ }
+
+ /* do convolutions */
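+ /* conv3Dmv flags: "V" = valid convolution (no padding), "X" = cross-correlation */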
+ THTensor_(conv3Dmv)(output, 1.0, 1.0, input, weight, dT, dH, dW, "V", "X");
+ }
+ else /* batch mode */
+ {
+ long nBatch = input->size[0];
+ THTensor_(resize5d)(output, nBatch, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THTensor *inb = THTensor_(new)();
+ THTensor *outb = THTensor_(new)();
+
+ /* loop over batches */
+ for (j = 0; j < nBatch; j++)
+ {
+ THTensor_(select)(inb, input, 0, j);
+ THTensor_(select)(outb, output, 0, j);
+
+ /* add bias */
+ if (bias) {
+ for (i = 0; i < bias->size[0]; i++)
+ {
+ THTensor_(select)(outn, outb, 0, i);
+ THTensor_(fill)(outn, THTensor_(get1d)(bias, i));
+ }
+ } else {
+ THTensor_(zero)(outb);
+ }
+
+ /* do convolutions */
+ THTensor_(conv3Dmv)(outb, 1.0, 1.0, inb, weight, dT, dH, dW, "V", "X");
+ }
+
+ THTensor_(free)(inb);
+ THTensor_(free)(outb);
+ }
+ THTensor_(free)(outn);
+}
+
+void THNN_(VolumetricConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput, // only used by cuda impl
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ THArgCheck(pT == 0 && pW == 0 && pH == 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version
+
+ THNN_ARGCHECK(weight->nDimension == 5, 4, weight,
+ "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
+ "expected for weight, but got: %s");
+
+ int nOutputPlane = (int)weight->size[0];
+
+ THNN_ARGCHECK(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3,
+ gradOutput,
+ "4D or 5D (batch mode) tensor expected for gradOutput, but got: %s");
+
+ int dimPlane = 0;
+ if (gradOutput->nDimension == 5)
+ {
+ dimPlane++;
+ }
+
+ THArgCheck(nOutputPlane == gradOutput->size[dimPlane], 1,
+ "Number of output features is not equal to nOutputPlane"
+ );
+
+ /* gradient to input */
+ THTensor *tweight = THTensor_(newTranspose)(weight, 0, 1);
+ if (gradOutput->nDimension == 4) /* non-batch mode */
+ {
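+ /* "F" = full convolution, "C" = true convolution (kernel flipped): the adjoint of the forward cross-correlation */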
+ THTensor_(conv3Dmv)(gradInput, 0.0, 1.0, gradOutput, tweight, dT, dH, dW, "F", "C");
+ }
+ else /* batch mode */
+ {
+ long nBatch = gradOutput->size[0];
+ THTensor *ginpb = THTensor_(new)();
+ THTensor *goutb = THTensor_(new)();
+ long j;
+
+ THTensor_(resize5d)(gradInput,
+ input->size[0], input->size[1], input->size[2], input->size[3], input->size[4]
+ );
+
+ /* loop over batches */
+ for (j = 0; j < nBatch; j++)
+ {
+ THTensor_(select)(ginpb, gradInput, 0, j);
+ THTensor_(select)(goutb, gradOutput, 0, j);
+ THTensor_(conv3Dmv)(ginpb, 0.0, 1.0, goutb, tweight, dT, dH, dW, "F", "C");
+ }
+ THTensor_(free)(ginpb);
+ THTensor_(free)(goutb);
+ }
+
+ THTensor_(free)(tweight);
+}
+
+void THNN_(VolumetricConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput, // only used by cuda impl
+ THTensor *fgradInput, // only used by cuda impl
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THArgCheck(pT == 0 && pW == 0 && pH == 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version
+
+ THNN_ARGCHECK(gradWeight->nDimension == 5, 4, gradWeight,
+ "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
+ "expected for gradWeight, but got: %s");
+
+ int nOutputPlane = (int)gradWeight->size[0];
+ if (gradBias) {
+ THArgCheck(gradBias->nDimension == 1 && gradBias->size[0] == nOutputPlane, 5,
+ "gradBias tensor has wrong size"
+ );
+ }
+
+ long k;
+ real *gradBias_data;
+ THTensor *gradOutSlice;
+ int dimPlane = 0;
+ if (gradOutput->nDimension == 5)
+ {
+ dimPlane++;
+ }
+
+ THArgCheck(nOutputPlane == gradOutput->size[dimPlane], 1,
+ "Number of output features is not equal to nOutputPlane"
+ );
+
+ if (gradOutput->nDimension == 4) /* non-batch mode */
+ {
+ /* gradient to bias */
+ if (gradBias) {
+ gradBias_data = THTensor_(data)(gradBias);
+ gradOutSlice = THTensor_(new)();
+ for (k = 0; k < nOutputPlane; k++)
+ {
+ THTensor_(select)(gradOutSlice, gradOutput, 0, k);
+ gradBias_data[k] += scale * THTensor_(sumall)(gradOutSlice);
+ }
+ THTensor_(free)(gradOutSlice);
+ }
+
+ /* gradient to kernels */
+ THTensor_(conv3DRevger)(gradWeight, 1.0, scale, input, gradOutput, dT, dH, dW);
+ }
+ else /* batch mode */
+ {
+ long nBatch = gradOutput->size[0];
+ THTensor *inpb = THTensor_(new)();
+ THTensor *goutb = THTensor_(new)();
+ long j;
+
+ /* loop over batches */
+ for (j = 0; j < nBatch; j++)
+ {
+ THTensor_(select)(inpb, input, 0, j);
+ THTensor_(select)(goutb, gradOutput, 0, j);
+
+ /* gradient to bias */
+ if (gradBias) {
+ gradBias_data = THTensor_(data)(gradBias);
+ gradOutSlice = THTensor_(new)();
+ for (k = 0; k < nOutputPlane; k++)
+ {
+ THTensor_(select)(gradOutSlice, goutb, 0, k);
+ gradBias_data[k] += scale * THTensor_(sumall)(gradOutSlice);
+ }
+ THTensor_(free)(gradOutSlice);
+ }
+
+ /* gradient to kernels */
+ THTensor_(conv3DRevger)(gradWeight, 1.0, scale, inpb, goutb, dT, dH, dW);
+ }
+ THTensor_(free)(inpb);
+ THTensor_(free)(goutb);
+ }
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolutionMM.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolutionMM.c
new file mode 100644
index 000000000..00a121db6
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolutionMM.c
@@ -0,0 +1,628 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricConvolutionMM.c"
+#else
+
+static void inline THNN_(VolumetricConvolutionMM_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *weight,
+ THTensor *bias,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH) {
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+ THArgCheck(kT > 0 && kW > 0 && kH > 0, 8,
+ "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", kT, kH, kW);
+ THArgCheck(dT > 0 && dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW);
+
+ int ndim = input->nDimension;
+ int dimf = 0;
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ if (ndim == 5)
+ {
+ dimf++;
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ long nInputPlane;
+ long inputDepth;
+ long inputHeight;
+ long inputWidth;
+ long nOutputPlane;
+ long outputDepth;
+ long outputHeight;
+ long outputWidth;
+
+ nInputPlane = input->size[dimf];
+ inputDepth = input->size[dimt];
+ inputHeight = input->size[dimh];
+ inputWidth = input->size[dimw];
+ nOutputPlane = weight->size[0];
+ outputDepth = (inputDepth + 2*pT - kT) / dT + 1;
+ outputHeight = (inputHeight + 2*pH - kH) / dH + 1;
+ outputWidth = (inputWidth + 2*pW - kW) / dW + 1;
+
+ if (outputWidth < 1 || outputHeight < 1 || outputDepth < 1)
+ {
+ THError(
+ "Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
+ nInputPlane, inputDepth, inputHeight, inputWidth,
+ nOutputPlane, outputDepth, outputHeight, outputWidth
+ );
+ }
+
+ THArgCheck(weight->nDimension == 2 || weight->nDimension == 5, 4,
+ "weight tensor should be 2D or 5D - got %d", weight->nDimension);
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]);
+ }
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, outputDepth);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
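+/* views a 5D weight tensor (nOutputPlane x nInputPlane x kT x kH x kW) as a 2D
+ (nOutputPlane x nInputPlane*kT*kH*kW) matrix; returns 1 if a new view was
+ allocated, in which case the caller is responsible for freeing it */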
+static int THNN_(view_weight)(THTensor **_weight)
+{
+ THTensor *weight = *_weight;
+ if (weight->nDimension == 5) {
+ long s1 = weight->size[0];
+ long s2 = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
+ *_weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset, s1, -1, s2, -1);
+ return 1;
+ }
+ return 0;
+}
+
+/* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */
+static void THNN_(unfolded_acc_vol)(
+ THTensor *finput,
+ THTensor *input,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ int nInputPlane,
+ int inputDepth,
+ int inputWidth,
+ int inputHeight,
+ int outputDepth,
+ int outputWidth,
+ int outputHeight)
+{
+ int nip;
+ real *input_data = THTensor_(data)(input);
+ real *finput_data = THTensor_(data)(finput);
+
+//#pragma omp parallel for private(nip)
+ for (nip = 0; nip < nInputPlane; nip++)
+ {
+ int kt, kw, kh, t, y, x, it, ix, iy;
+ for (kt = 0; kt < kT; kt++)
+ {
+ for (kh = 0; kh < kH; kh++)
+ {
+ for (kw = 0; kw < kW; kw++)
+ {
+ real *src = finput_data
+ + nip * (kT*kH*kW*outputDepth*outputHeight*outputWidth)
+ + kt * (kH*kW*outputDepth*outputHeight*outputWidth)
+ + kh * (kW*outputDepth*outputHeight*outputWidth)
+ + kw * (outputDepth*outputHeight*outputWidth);
+
+ real *dst = input_data + nip*(inputDepth*inputHeight*inputWidth);
+ if (pT > 0 || pH > 0 || pW > 0)
+ {
+ for (t = 0; t < outputDepth; t++)
+ {
+ it = t*dT - pT + kt;
+ for (y = 0; y < outputHeight; y++)
+ {
+ iy = y*dH - pH + kh;
+ for (x = 0; x < outputWidth; x++)
+ {
+ ix = x*dW - pW + kw;
+ if (it < 0 || it >= inputDepth || iy < 0 || iy >= inputHeight || ix < 0 || ix >= inputWidth)
+ {
+ }
+ else
+ {
+ real *dst_slice = dst+it*inputHeight*inputWidth+iy*inputWidth+ix;
+ THVector_(cadd)(dst_slice, dst_slice, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1);
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ for (t = 0; t < outputDepth; t++)
+ {
+ it = t*dT + kt;
+ for (y = 0; y < outputHeight; y++)
+ {
+ iy = y*dH + kh;
+ for(x = 0; x < outputWidth; x++)
+ {
+ ix = x*dW + kw;
+ real *dst_slice = dst+it*inputHeight*inputWidth+iy*inputWidth+ix;
+ THVector_(cadd)(dst_slice, dst_slice, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+static void THNN_(unfolded_copy_vol)(
+ THTensor *finput,
+ THTensor *input,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ int nInputPlane,
+ int inputDepth,
+ int inputWidth,
+ int inputHeight,
+ int outputDepth,
+ int outputWidth,
+ int outputHeight)
+{
+ long k;
+ real *input_data = THTensor_(data)(input);
+ real *finput_data = THTensor_(data)(finput);
+// #pragma omp parallel for private(k)
+ for (k = 0; k < nInputPlane*kT*kH*kW; k++)
+ {
+ int nip = k / (kT*kH*kW);
+ int rest = k % (kT*kH*kW);
+ int kt = rest / (kH*kW);
+ rest = rest % (kH*kW);
+ int kh = rest / kW;
+ int kw = rest % kW;
+ int t,x,y,it,ix,iy;
+ real *dst = finput_data
+ + nip * (kT*kH*kW*outputDepth*outputHeight*outputWidth)
+ + kt * (kH*kW*outputDepth*outputHeight*outputWidth)
+ + kh * (kW*outputDepth*outputHeight*outputWidth)
+ + kw * (outputDepth*outputHeight*outputWidth);
+ real *src = input_data + nip*(inputDepth*inputHeight*inputWidth);
+
+ if (pT > 0 || pH > 0 || pW > 0)
+ {
+ for (t = 0; t < outputDepth; t++)
+ {
+ it = t*dT - pT + kt;
+ for (y = 0; y < outputHeight; y++)
+ {
+ iy = y*dH - pH + kh;
+ for (x = 0; x < outputWidth; x++)
+ {
+ ix = x*dW - pW + kw;
+ if (it < 0 || it >= inputDepth || iy < 0 || iy >= inputHeight || ix < 0 || ix >= inputWidth)
+ memset(dst+t*outputHeight*outputWidth+y*outputWidth+x, 0, sizeof(real)*(1));
+ else
+ memcpy(dst+t*outputHeight*outputWidth+y*outputWidth+x, src+it*inputHeight*inputWidth+iy*inputWidth+ix, sizeof(real)*(1));
+ }
+ }
+ }
+ }
+ else
+ {
+ for (t = 0; t < outputDepth; t++)
+ {
+ it = t*dT + kt;
+ for (y = 0; y < outputHeight; y++)
+ {
+ iy = y*dH + kh;
+ for(x = 0; x < outputWidth; x++)
+ {
+ ix = x*dW + kw;
+ memcpy(dst+t*outputHeight*outputWidth+y*outputWidth+x, src+it*inputHeight*inputWidth+iy*inputWidth+ix, sizeof(real)*(1));
+ }
+ }
+ }
+ }
+ }
+}
+
+static void THNN_(VolumetricConvolutionMM_updateOutput_frame)(
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ long nInputPlane,
+ long inputDepth,
+ long inputWidth,
+ long inputHeight,
+ long nOutputPlane,
+ long outputDepth,
+ long outputWidth,
+ long outputHeight)
+{
+ long i;
+ THTensor *output2d;
+
+ THNN_(unfolded_copy_vol)(
+ finput, input,
+ kT, kW, kH,
+ dT, dW, dH,
+ pT, pW, pH,
+ nInputPlane,
+ inputDepth, inputWidth, inputHeight,
+ outputDepth, outputWidth, outputHeight
+ );
+
+ output2d = THTensor_(newWithStorage2d)(
+ output->storage, output->storageOffset, nOutputPlane, -1,
+ outputDepth*outputHeight*outputWidth, -1
+ );
+
+ if (bias) {
+ for (i = 0; i < nOutputPlane; i++)
+ {
+ THVector_(fill)(
+ output->storage->data+output->storageOffset+output->stride[0]*i,
+ THTensor_(get1d)(bias, i),
+ outputDepth*outputHeight*outputWidth
+ );
+ }
+ } else {
+ THTensor_(zero)(output);
+ }
+
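+ // weight: nOutputPlane x (nInputPlane*kT*kH*kW)
+ // finput: (nInputPlane*kT*kH*kW) x (outputDepth*outputHeight*outputWidth)
+ // output2d: nOutputPlane x (outputDepth*outputHeight*outputWidth)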
+ THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput);
+
+ THTensor_(free)(output2d);
+}
+
+void THNN_(VolumetricConvolutionMM_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *finput,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ int dimf = 0;
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+ int freeWeight = 0;
+
+ long nInputPlane;
+ long inputDepth;
+ long inputHeight;
+ long inputWidth;
+ long nOutputPlane;
+ long outputDepth;
+ long outputHeight;
+ long outputWidth;
+
+ THNN_(VolumetricConvolutionMM_shapeCheck)(
+ state, input, NULL, weight, bias,
+ kT, kW, kH, dT, dW, dH, pT, pW, pH);
+ input = THTensor_(newContiguous)(input);
+
+ if (input->nDimension == 5)
+ {
+ dimf++;
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ nInputPlane = input->size[dimf];
+ inputDepth = input->size[dimt];
+ inputHeight = input->size[dimh];
+ inputWidth = input->size[dimw];
+ nOutputPlane = weight->size[0];
+ outputDepth = (inputDepth + 2*pT - kT) / dT + 1;
+ outputHeight = (inputHeight + 2*pH - kH) / dH + 1;
+ outputWidth = (inputWidth + 2*pW - kW) / dW + 1;
+
+ freeWeight = THNN_(view_weight)(&weight);
+
+ if (input->nDimension == 4)
+ {
+ THTensor_(resize2d)(finput, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth);
+ THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
+
+ THNN_(VolumetricConvolutionMM_updateOutput_frame)(
+ input, output, weight, bias, finput,
+ kT, kW, kH,
+ dT, dW, dH,
+ pT, pW, pH,
+ nInputPlane, inputDepth, inputWidth, inputHeight,
+ nOutputPlane, outputDepth, outputWidth, outputHeight
+ );
+ }
+ else
+ {
+ long T = input->size[0];
+ long t;
+
+ THTensor_(resize3d)(finput, T, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth);
+ THTensor_(resize5d)(output, T, nOutputPlane, outputDepth, outputHeight, outputWidth);
+
+// #pragma omp parallel for private(t)
+ for (t = 0; t < T; t++)
+ {
+ THTensor *input_t = THTensor_(newSelect)(input, 0, t);
+ THTensor *output_t = THTensor_(newSelect)(output, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ THNN_(VolumetricConvolutionMM_updateOutput_frame)(
+ input_t, output_t, weight, bias, finput_t,
+ kT, kW, kH,
+ dT, dW, dH,
+ pT, pW, pH,
+ nInputPlane, inputDepth, inputWidth, inputHeight,
+ nOutputPlane, outputDepth, outputWidth, outputHeight
+ );
+
+ THTensor_(free)(input_t);
+ THTensor_(free)(output_t);
+ THTensor_(free)(finput_t);
+ }
+ }
+
+ THTensor_(free)(input);
+ if (freeWeight)
+ THTensor_(free)(weight);
+}
+
+static void THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
+ THTensor *gradInput,
+ THTensor *gradOutput,
+ THTensor *weight,
+ THTensor *fgradInput,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ THTensor *gradOutput2d = THTensor_(newWithStorage2d)(
+ gradOutput->storage, gradOutput->storageOffset,
+ gradOutput->size[0], -1,
+ gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1
+ );
+
+ THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d);
+ THTensor_(free)(gradOutput2d);
+
+ THTensor_(zero)(gradInput);
+
+ THNN_(unfolded_acc_vol)(
+ fgradInput, gradInput,
+ kT, kW, kH,
+ dT, dW, dH,
+ pT, pW, pH,
+ gradInput->size[0], gradInput->size[1], gradInput->size[3], gradInput->size[2],
+ gradOutput->size[1], gradOutput->size[3], gradOutput->size[2]
+ );
+}
+
+void THNN_(VolumetricConvolutionMM_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ int nOutputPlane = (int)weight->size[0];
+
+ THNN_(VolumetricConvolutionMM_shapeCheck)(
+ state, input, gradOutput, weight, NULL,
+ kT, kW, kH, dT, dW, dH, pT, pW, pH);
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ int freeWeight = THNN_(view_weight)(&weight);
+
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(resizeAs)(fgradInput, finput);
+ // depending on the BLAS library, fgradInput (result tensor) might
+ // be left uninitialized on zero alpha, which might lead to weird behavior
+ // hence, to be safe, zero it
+ THTensor_(zero)(fgradInput);
+ THTensor *tweight = THTensor_(new)();
+ THTensor_(transpose)(tweight, weight, 0, 1);
+
+ if (input->nDimension == 4)
+ {
+ THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
+ gradInput, gradOutput, tweight, fgradInput,
+ kT, kW, kH,
+ dT, dW, dH,
+ pT, pW, pH
+ );
+ }
+ else
+ {
+ long T = input->size[0];
+ long t;
+
+//#pragma omp parallel for private(t)
+ for (t = 0; t < T; t++)
+ {
+ THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t);
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t);
+
+ THNN_(VolumetricConvolutionMM_updateGradInput_frame)(
+ gradInput_t, gradOutput_t, tweight, fgradInput_t,
+ kT, kW, kH,
+ dT, dW, dH,
+ pT, pW, pH
+ );
+
+ THTensor_(free)(gradInput_t);
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(fgradInput_t);
+ }
+ }
+
+ THTensor_(free)(tweight);
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ if (freeWeight)
+ THTensor_(free)(weight);
+}
+
+static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)(
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ real scale)
+{
+ long i;
+ THTensor *gradOutput2d = THTensor_(newWithStorage2d)(
+ gradOutput->storage, gradOutput->storageOffset,
+ gradOutput->size[0], -1,
+ gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1
+ );
+
+ THTensor *tfinput = THTensor_(new)();
+ THTensor_(transpose)(tfinput, finput, 0, 1);
+ THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, tfinput);
+ THTensor_(free)(tfinput);
+
+ if (gradBias) {
+ for (i = 0; i < gradBias->size[0]; i++)
+ {
+ long k;
+ real sum = 0;
+ real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0];
+ for (k = 0; k < gradOutput2d->size[1]; k++)
+ sum += data[k];
+
+ (gradBias->storage->data + gradBias->storageOffset)[i] += scale * sum;
+ }
+ }
+
+ THTensor_(free)(gradOutput2d);
+}
+
+void THNN_(VolumetricConvolutionMM_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ int freeWeight;
+ int nOutputPlane = (int)gradWeight->size[0];
+
+ THNN_(VolumetricConvolutionMM_shapeCheck)(
+ state, input, gradOutput, gradWeight, gradBias,
+ kT, kW, kH, dT, dW, dH, pT, pW, pH);
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ freeWeight = THNN_(view_weight)(&gradWeight);
+
+ if (input->nDimension == 4) // non-batch mode
+ {
+ THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale);
+ }
+ else // batch mode
+ {
+ long T = input->size[0];
+ long t;
+
+ for (t = 0; t < T; t++)
+ {
+ THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t);
+ THTensor *finput_t = THTensor_(newSelect)(finput, 0, t);
+
+ THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale);
+
+ THTensor_(free)(gradOutput_t);
+ THTensor_(free)(finput_t);
+ }
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ if (freeWeight)
+ THTensor_(free)(gradWeight);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedConvolution.c
new file mode 100644
index 000000000..ca740f78e
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedConvolution.c
@@ -0,0 +1,420 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricDilatedConvolution.c"
+#else
+
+static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int kT, int kH, int kW, int dT, int dH, int dW,
+ int padT, int padH, int padW,
+ int dilationT, int dilationH, int dilationW) {
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+ THNN_ARGCHECK(weight->nDimension == 5, 4, weight,
+ "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
+ "expected for weight, but got: %s");
+ THArgCheck(kT > 0 && kW > 0 && kH > 0, 8,
+ "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", kT, kH, kW);
+ THArgCheck(dT > 0 && dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW);
+ THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 15,
+ "dilation should be greater than zero, but got dilationT: %d, dilationH: %d, dilationW: %d",
+ dilationT, dilationH, dilationW);
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]);
+ }
+
+ // Params
+ int ndim = input->nDimension;
+ int nInputPlane = weight->size[1];
+ int nOutputPlane = weight->size[0];
+ int dimf = 0;
+ int dimd = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ if (ndim == 5) {
+ dimf++;
+ dimd++;
+ dimh++;
+ dimw++;
+ }
+
+ long inputDepth = input->size[dimd];
+ long inputHeight = input->size[dimh];
+ long inputWidth = input->size[dimw];
+ long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
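+ // effective kernel extent with dilation is dilation*(k-1)+1 along each dimension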
+
+ if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
+ nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth);
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
+void THNN_(VolumetricDilatedConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THTensor *weight,
+ THTensor *bias,
+ THTensor *columns,
+ THTensor *ones,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH)
+{
+ THNN_(VolumetricDilatedConvolution_shapeCheck)(
+ input, NULL, weight, bias,
+ kT, kH, kW, dT, dH, dW, padT, padH, padW,
+ dilationT, dilationH, dilationW);
+
+ // Params:
+ int nInputPlane = weight->size[1];
+ int nOutputPlane = weight->size[0];
+
+ input = THTensor_(newContiguous)(input);
+ weight = THTensor_(newContiguous)(weight);
+ bias = bias ? THTensor_(newContiguous)(bias) : bias;
+ int batch = 1;
+ if (input->nDimension == 4) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ }
+
+ long inputDepth = input->size[2];
+ long inputHeight = input->size[3];
+ long inputWidth = input->size[4];
+ long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THTensor_(zero)(output);
+
+ // Resize temporary columns
+ THTensor_(resize2d)(columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
+
+ // Define a buffer of ones, for bias accumulation
+ // Note: this buffer can be shared with other modules, it only ever gets increased,
+ // and always contains ones.
+ if (ones->nDimension != 3 ||
+ ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth);
+ THTensor_(fill)(ones, 1);
+ }
+
+ // Helpers
+ THTensor *input_n = THTensor_(new)();
+ THTensor *output_n = THTensor_(new)();
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per output:
+ THTensor_(select)(input_n, input, 0, elt);
+ THTensor_(select)(output_n, output, 0, elt);
+
+ // Do Bias first:
+ // M,N,K are dims of matrix A and B
+ long m_ = nOutputPlane;
+ long n_ = outputDepth * outputHeight * outputWidth;
+ long k_ = 1;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ if (bias) {
+ THBlas_(gemm)(
+ 't', 'n',
+ n_, m_, k_,
+ 1,
+ THTensor_(data)(ones), k_,
+ THTensor_(data)(bias), k_,
+ 0,
+ THTensor_(data)(output_n), n_
+ );
+ } else {
+ THTensor_(zero)(output_n);
+ }
+
+ // Extract columns:
+ THNN_(vol2col)(
+ THTensor_(data)(input_n),
+ nInputPlane, inputDepth, inputHeight, inputWidth,
+ kT, kH, kW, padT, padH, padW, dT, dH, dW,
+ dilationT, dilationH, dilationW,
+ THTensor_(data)(columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ long m = nOutputPlane;
+ long n = columns->size[1];
+ long k = nInputPlane*kT*kH*kW;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ THBlas_(gemm)(
+ 'n', 'n',
+ n, m, k,
+ 1,
+ THTensor_(data)(columns), n,
+ THTensor_(data)(weight), k,
+ 1,
+ THTensor_(data)(output_n), n
+ );
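+
+ // In row-major terms the call above computes output_n += weight * columns,
+ // with weight (nOutputPlane x nInputPlane*kT*kH*kW) and columns
+ // (nInputPlane*kT*kH*kW x outputDepth*outputHeight*outputWidth); swapping
+ // m and n compensates for gemm's column-major convention.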
+ }
+
+ // Free
+ THTensor_(free)(input_n);
+ THTensor_(free)(output_n);
+
+ // Resize output
+ if (batch == 0) {
+ THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(weight);
+ if (bias) THTensor_(free)(bias);
+}
+
+void THNN_(VolumetricDilatedConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *gradColumns,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH)
+{
+ THNN_(VolumetricDilatedConvolution_shapeCheck)(
+ input, gradOutput, weight, NULL,
+ kT, kH, kW, dT, dH, dW, padT, padH, padW,
+ dilationT, dilationH, dilationW);
+
+ // Params
+ int nInputPlane = weight->size[1];
+ int nOutputPlane = weight->size[0];
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ weight = THTensor_(newContiguous)(weight);
+
+ int batch = 1;
+ if (input->nDimension == 4) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+ }
+
+ long inputDepth = input->size[2];
+ long inputWidth = input->size[4];
+ long inputHeight = input->size[3];
+ long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Resize output
+ THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth);
+
+ // Resize temporary columns
+ THTensor_(resize2d)(gradColumns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
+ THTensor_(zero)(gradColumns);
+
+ // Helpers
+ THTensor *gradInput_n = THTensor_(new)();
+ THTensor *gradOutput_n = THTensor_(new)();
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per sample:
+ THTensor_(select)(gradInput_n, gradInput, 0, elt);
+ THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+ // M,N,K are dims of matrix A and B
+ long m = nInputPlane*kT*kW*kH;
+ long n = gradColumns->size[1];
+ long k = nOutputPlane;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ THBlas_(gemm)(
+ 'n', 't',
+ n, m, k,
+ 1,
+ THTensor_(data)(gradOutput_n), n,
+ THTensor_(data)(weight), m,
+ 0,
+ THTensor_(data)(gradColumns), n
+ );
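+
+ // Row-major view: gradColumns = weight^T * gradOutput_n, projecting each
+ // output gradient back onto the kernel taps; col2vol below then
+ // scatter-adds the column matrix into gradInput_n.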
+
+ // Unpack columns back into input:
+ THNN_(col2vol)(
+ THTensor_(data)(gradColumns),
+ nInputPlane, inputDepth, inputHeight, inputWidth,
+ kT, kH, kW, padT, padH, padW, dT, dH, dW,
+ dilationT, dilationH, dilationW,
+ THTensor_(data)(gradInput_n)
+ );
+ }
+
+ // Free
+ THTensor_(free)(gradInput_n);
+ THTensor_(free)(gradOutput_n);
+
+ // Resize output
+ if (batch == 0) {
+ THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
+ THTensor_(resize4d)(gradInput, nInputPlane, inputDepth, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+}
+
+void THNN_(VolumetricDilatedConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *columns,
+ THTensor *ones,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int padT, int padW, int padH,
+ int dilationT, int dilationW, int dilationH,
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ THNN_(VolumetricDilatedConvolution_shapeCheck)(
+ input, gradOutput, gradWeight, gradBias,
+ kT, kH, kW, dT, dH, dW, padT, padH, padW,
+ dilationT, dilationH, dilationW);
+
+ // Params
+ int nInputPlane = gradWeight->size[1];
+ int nOutputPlane = gradWeight->size[0];
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ int batch = 1;
+ if (input->nDimension == 4) {
+ // Force batch
+ batch = 0;
+ THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+ }
+
+ long inputDepth = input->size[2];
+ long inputWidth = input->size[4];
+ long inputHeight = input->size[3];
+ long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1;
+ long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+ // Batch size + input planes
+ long batchSize = input->size[0];
+
+ // Define a buffer of ones, for bias accumulation
+ if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) {
+ // Resize plane and fill with ones...
+ THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth);
+ THTensor_(fill)(ones, 1);
+ }
+
+ // Resize temporary columns
+ THTensor_(resize2d)(columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth);
+
+ // Helpers
+ THTensor *input_n = THTensor_(new)();
+ THTensor *gradOutput_n = THTensor_(new)();
+
+ // For each elt in batch, do:
+ for (int elt = 0; elt < batchSize; elt ++) {
+ // Matrix multiply per output:
+ THTensor_(select)(input_n, input, 0, elt);
+ THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ THNN_(vol2col)(
+ THTensor_(data)(input_n),
+ nInputPlane, inputDepth, inputHeight, inputWidth,
+ kT, kH, kW, padT, padH, padW, dT, dH, dW,
+ dilationT, dilationH, dilationW,
+ THTensor_(data)(columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ long m = nOutputPlane;
+ long n = nInputPlane*kT*kW*kH;
+ long k = columns->size[1];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ THBlas_(gemm)(
+ 't', 'n',
+ n, m, k,
+ scale,
+ THTensor_(data)(columns), k,
+ THTensor_(data)(gradOutput_n), k,
+ 1,
+ THTensor_(data)(gradWeight), n
+ );
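+
+ // Row-major view: gradWeight += scale * gradOutput_n * columns^T, the outer
+ // product of output-plane gradients with the unrolled input patches,
+ // accumulated over all output positions of this sample.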
+
+ // Do Bias:
+ // M,N,K are dims of matrix A and B
+ long m_ = nOutputPlane;
+ long k_ = outputDepth * outputHeight * outputWidth;
+
+ // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
+ if (gradBias) {
+ THBlas_(gemv)(
+ 't',
+ k_, m_,
+ scale,
+ THTensor_(data)(gradOutput_n), k_,
+ THTensor_(data)(ones), 1,
+ 1,
+ THTensor_(data)(gradBias), 1
+ );
+ }
+ }
+
+ // Free
+ THTensor_(free)(input_n);
+ THTensor_(free)(gradOutput_n);
+
+ // Resize
+ if (batch == 0) {
+ THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedMaxPooling.c
new file mode 100644
index 000000000..66c0f9531
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedMaxPooling.c
@@ -0,0 +1,515 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricDilatedMaxPooling.c"
+#else
+
+static inline void THNN_(VolumetricDilatedMaxPooling_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THIndexTensor *indices,
+ int kT, int kW, int kH,
+ int dT, int dW, int dH,
+ int pT, int pW, int pH,
+ int dilationT, int dilationW, int dilationH,
+ bool ceilMode) {
+ int ndim = input->nDimension;
+ int dimN = 0;
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+ long nslices;
+ long itime;
+ long iheight;
+ long iwidth;
+ long otime;
+ long oheight;
+ long owidth;
+
+ THArgCheck(kT > 0 && kW > 0 && kH > 0, 5,
+ "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d",
+ kT, kH, kW);
+ THArgCheck(dT > 0 && dW > 0 && dH > 0, 8,
+ "stride should be greater than zero, but got dT: %d dH: %d dW: %d",
+ dT, dH, dW);
+ THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 14,
+ "dilation should be greater than 0, but got dilationT: %d dilationH: %d dilationW: %d",
+ dilationT, dilationH, dilationW);
+
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+
+ if (input->nDimension == 5)
+ {
+ dimN++;
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ THArgCheck(kT/2 >= pT && kW/2 >= pW && kH/2 >= pH, 2,
+ "pad should be smaller than half of kernel size, but got "
+ "kT: %d kW: %d, kH: %d, padT: %d, padW: %d, padH: %d",
+ kT, kW, kH, pT, pW, pH);
+
+ nslices = input->size[dimN];
+ itime = input->size[dimt];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ if (ceilMode)
+ {
+ otime = (int)(ceil((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1;
+ oheight = (int)(ceil((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1;
+ owidth = (int)(ceil((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1;
+ }
+ else
+ {
+ otime = (int)(floor((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1;
+ oheight = (int)(floor((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1;
+ owidth = (int)(floor((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1;
+ }
+
+ if (pT || pW || pH)
+ {
+ // ensure that the last pooling starts inside the image
+ if ((otime - 1)*dT >= itime + pT)
+ --otime;
+ if ((oheight - 1)*dH >= iheight + pH)
+ --oheight;
+ if ((owidth - 1)*dW >= iwidth + pW)
+ --owidth;
+ }
+
+ if (otime < 1 || owidth < 1 || oheight < 1)
+ THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
+ nslices,itime,iheight,iwidth,nslices,otime,oheight,owidth);
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimN, nslices);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, otime);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, oheight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, owidth);
+ }
+ if (indices != NULL) {
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimN, nslices);
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimt, otime);
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimh, oheight);
+ THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimw, owidth);
+ }
+}
+
+static void THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)(
+ real *input_p,
+ real *output_p,
+ THIndex_t *indz_p,
+ long nslices,
+ long itime,
+ long iwidth,
+ long iheight,
+ long otime,
+ long owidth,
+ long oheight,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ int dilationT,
+ int dilationW,
+ int dilationH)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ /* loop over output */
+ long i, j, ti;
+ for (ti = 0; ti < otime; ti++)
+ {
+ for (i = 0; i < oheight; i++)
+ {
+ for (j = 0; j < owidth; j++)
+ {
+ /* local pointers */
+
+ long start_t = ti * dT - pT;
+ long start_h = i * dH - pH;
+ long start_w = j * dW - pW;
+
+ long kernel_t = fminf(kT, kT + start_t);
+ long kernel_h = fminf(kH, kH + start_h);
+ long kernel_w = fminf(kW, kW + start_w);
+
+ while(start_t < 0)
+ start_t += dilationT;
+ while(start_h < 0)
+ start_h += dilationH;
+ while(start_w < 0)
+ start_w += dilationW;
+
+ real *ip = input_p + k * itime * iwidth * iheight
+ + start_t * iwidth * iheight + start_h * iwidth + start_w;
+ real *op = output_p + k * otime * owidth * oheight
+ + ti * owidth * oheight + i * owidth + j;
+ THIndex_t *indzp = indz_p + k * otime * owidth * oheight
+ + ti * owidth * oheight + i * owidth + j;
+
+ /* compute local max: */
+ real maxval = -THInf;
+ int x,y,z;
+ int mx, my, mz;
+ mx = my = mz = -1;
+
+ for (z = 0; z < kernel_t; z++)
+ {
+ for (y = 0; y < kernel_h; y++)
+ {
+ for (x = 0; x < kernel_w; x++)
+ {
+ if ((start_t + z * dilationT < itime) && (start_h + y * dilationH < iheight) && (start_w + x * dilationW < iwidth))
+ {
+ real val = *(ip + z * dilationT * iwidth * iheight + y * dilationH * iwidth + x * dilationW);
+ if (val > maxval)
+ {
+ maxval = val;
+ // Store indices w.r.t the kernel dimension
+ mz = z + (kT - kernel_t);
+ my = y + (kH - kernel_h);
+ mx = x + (kW - kernel_w);
+ }
+ }
+ }
+ }
+ }
+
+ // set max values
+ ((unsigned char*)(indzp))[0] = mz;
+ ((unsigned char*)(indzp))[1] = my;
+ ((unsigned char*)(indzp))[2] = mx;
+ ((unsigned char*)(indzp))[3] = 0;
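+
+ /* The kernel-relative argmax offsets (t,h,w) are packed one per byte
+ into a single index word; updateGradInput unpacks them and recovers
+ the input position as window_start + offset * dilation. This packing
+ assumes every kernel dimension is < 256. */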
+
+ /* set output to local max */
+ *op = maxval;
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ int dilationT,
+ int dilationW,
+ int dilationH,
+ bool ceilMode)
+{
+ long nslices;
+ long itime;
+ long iheight;
+ long iwidth;
+ long otime;
+ long oheight;
+ long owidth;
+ real *input_data;
+ real *output_data;
+ THIndex_t *indices_data;
+
+
+ int dimN = 0;
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ if (input->nDimension == 5)
+ {
+ dimN++;
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ THNN_(VolumetricDilatedMaxPooling_shapeCheck)(
+ state, input, NULL, NULL,
+ kT, kW, kH, dT, dW, dH,
+ pT, pW, pH, dilationT, dilationW, dilationH,
+ ceilMode);
+
+ /* sizes */
+ nslices = input->size[dimN];
+ itime = input->size[dimt];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ if (ceilMode)
+ {
+ otime = (int)(ceil((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1;
+ oheight = (int)(ceil((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1;
+ owidth = (int)(ceil((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1;
+ }
+ else
+ {
+ otime = (int)(floor((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1;
+ oheight = (int)(floor((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1;
+ owidth = (int)(floor((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1;
+ }
+
+ if (pT || pW || pH)
+ {
+ // ensure that the last pooling starts inside the image
+ if ((otime - 1)*dT >= itime + pT)
+ --otime;
+ if ((oheight - 1)*dH >= iheight + pH)
+ --oheight;
+ if ((owidth - 1)*dW >= iwidth + pW)
+ --owidth;
+ }
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ if (input->nDimension == 4) /* non-batch mode */
+ {
+ /* resize output */
+ THTensor_(resize4d)(output, nslices, otime, oheight, owidth);
+ /* indices will contain ti,i,j uchar locations packed into each index word */
+ THIndexTensor_(resize4d)(indices, nslices, otime, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)(
+ input_data, output_data,
+ indices_data,
+ nslices,
+ itime, iwidth, iheight,
+ otime, owidth, oheight,
+ kT, kW, kH,
+ dT, dW, dH,
+ pT, pW, pH,
+ dilationT, dilationW, dilationH
+ );
+ }
+ else /* batch mode */
+ {
+ long p;
+ long nBatch = input->size[0];
+
+ long istride = nslices * itime * iwidth * iheight;
+ long ostride = nslices * otime * owidth * oheight;
+
+ /* resize output */
+ THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth);
+ /* indices will contain ti,i,j locations for each output point */
+ THIndexTensor_(resize5d)(indices, nBatch, nslices, otime, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+#pragma omp parallel for private(p)
+ for (p=0; p < nBatch; p++)
+ {
+ THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)(
+ input_data + p * istride,
+ output_data + p * ostride,
+ indices_data + p * ostride,
+ nslices,
+ itime, iwidth, iheight,
+ otime, owidth, oheight,
+ kT, kW, kH,
+ dT, dW, dH,
+ pT, pW, pH,
+ dilationT, dilationW, dilationH
+ );
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+}
+
+static void THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)(
+ real *gradInput_p,
+ real *gradOutput_p,
+ THIndex_t *indz_p,
+ long nslices,
+ long itime,
+ long iwidth,
+ long iheight,
+ long otime,
+ long owidth,
+ long oheight,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ int dilationT,
+ int dilationW,
+ int dilationH)
+{
+ long k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ real *gradInput_p_k = gradInput_p + k * itime * iwidth * iheight;
+ real *gradOutput_p_k = gradOutput_p + k * otime * owidth * oheight;
+ THIndex_t *indz_p_k = indz_p + k * otime * owidth * oheight;
+
+ /* calculate max points */
+ long ti, i, j;
+ for (ti = 0; ti < otime; ti++)
+ {
+ for (i = 0; i < oheight; i++)
+ {
+ for (j = 0; j < owidth; j++)
+ {
+ /* retrieve position of max */
+ THIndex_t * indzp = &indz_p_k[ti * oheight * owidth + i * owidth + j];
+ long maxti = ((unsigned char*)(indzp))[0] * dilationT + ti * dT - pT;
+ long maxi = ((unsigned char*)(indzp))[1] * dilationH + i * dH - pH;
+ long maxj = ((unsigned char*)(indzp))[2] * dilationW + j * dW - pW;
+
+ if (maxti != -1) {
+ /* update gradient */
+ gradInput_p_k[maxti * iheight * iwidth + maxi * iwidth + maxj] +=
+ gradOutput_p_k[ti * oheight * owidth + i * owidth + j];
+ }
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ int dilationT,
+ int dilationW,
+ int dilationH,
+ bool ceilMode)
+{
+ int nslices;
+ int itime;
+ int iheight;
+ int iwidth;
+ int otime;
+ int oheight;
+ int owidth;
+ real *gradInput_data;
+ real *gradOutput_data;
+ THIndex_t *indices_data;
+
+ int dimN = 0;
+ int dimt = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ THNN_(VolumetricDilatedMaxPooling_shapeCheck)(
+ state, input, gradOutput, indices,
+ kT, kW, kH, dT, dW, dH,
+ pT, pW, pH, dilationT, dilationW, dilationH,
+ ceilMode);
+
+ // TODO: gradOutput shape check
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ if (input->nDimension == 5)
+ {
+ dimN++;
+ dimt++;
+ dimh++;
+ dimw++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimN];
+ itime = input->size[dimt];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ otime = gradOutput->size[dimt];
+ oheight = gradOutput->size[dimh];
+ owidth = gradOutput->size[dimw];
+
+ /* get raw pointers */
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+ indices_data = THIndexTensor_(data)(indices);
+
+ /* backprop */
+ if (input->nDimension == 4) /* non-batch mode */
+ {
+ THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)(
+ gradInput_data, gradOutput_data,
+ indices_data,
+ nslices,
+ itime, iwidth, iheight,
+ otime, owidth, oheight,
+ dT, dW, dH,
+ pT, pW, pH,
+ dilationT, dilationW, dilationH
+ );
+ }
+ else /* batch mode */
+ {
+ long p;
+ long nBatch = input->size[0];
+
+ long istride = nslices * itime * iwidth * iheight;
+ long ostride = nslices * otime * owidth * oheight;
+
+#pragma omp parallel for private(p)
+ for (p = 0; p < nBatch; p++)
+ {
+ THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)(
+ gradInput_data + p * istride,
+ gradOutput_data + p * ostride,
+ indices_data + p * ostride,
+ nslices,
+ itime, iwidth, iheight,
+ otime, owidth, oheight,
+ dT, dW, dH,
+ pT, pW, pH,
+ dilationT, dilationW, dilationH
+ );
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricFractionalMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricFractionalMaxPooling.c
new file mode 100644
index 000000000..236986bb9
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricFractionalMaxPooling.c
@@ -0,0 +1,279 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricFractionalMaxPooling.c"
+#else
+
+static long* THNN_(VolumetricFractionalMaxPooling_generateIntervals)(
+ real sample,
+ long inputSize,
+ long outputSize,
+ int poolSize) {
+ real alpha = (real) (inputSize - poolSize) / (real) (outputSize - 1);
+ long* sequence = (long*) THAlloc(sizeof(long) * outputSize);
+
+ long i;
+ for (i = 0; i < outputSize - 1; ++i) {
+ sequence[i] =
+ (long) ((i + sample) * alpha) - (long) (sample * alpha);
+ }
+ sequence[outputSize - 1] = inputSize - poolSize;
+
+ return sequence;
+}
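+
+/* The random sample in [0, 1) jitters where each pooling window starts: alpha
+ spreads outputSize windows of width poolSize across the input, so consecutive
+ starts differ by floor(alpha) or ceil(alpha). E.g. inputSize=10, outputSize=5,
+ poolSize=2 gives alpha=2 and, for sample 0.3, starts {0, 2, 4, 6, 8}; the
+ last start is always pinned to inputSize - poolSize. */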
+
+static void THNN_(VolumetricFractionalMaxPooling_updateOutput_frame)(
+ real* input,
+ real* output,
+ THIndex_t* indices,
+ real* randomSamples,
+ long numPlanes,
+ long inputT, long inputW, long inputH,
+ long outputT, long outputW, long outputH,
+ int poolSizeT, int poolSizeW, int poolSizeH) {
+ long plane;
+#pragma omp parallel for private(plane)
+ for (plane = 0; plane < numPlanes; ++plane) {
+ /* each plane contains 3 random samples, one for T, one for W, and one for H */
+ real* randomSamplesForPlane = randomSamples + plane * 3;
+
+ /* Generate interval sequence */
+ long* sequenceT =
+ THNN_(VolumetricFractionalMaxPooling_generateIntervals)(
+ randomSamplesForPlane[0], inputT, outputT, poolSizeT);
+ long* sequenceW =
+ THNN_(VolumetricFractionalMaxPooling_generateIntervals)(
+ randomSamplesForPlane[1], inputW, outputW, poolSizeW);
+ long* sequenceH =
+ THNN_(VolumetricFractionalMaxPooling_generateIntervals)(
+ randomSamplesForPlane[2], inputH, outputH, poolSizeH);
+
+ /* loop over output */
+ long h, w, t;
+
+ real* inputForPlane = input + plane * inputT * inputW * inputH;
+ real* outputForPlane = output + plane * outputT * outputW * outputH;
+ THIndex_t* indicesForPlane = indices + plane * outputT * outputW * outputH;
+
+ for (h = 0; h < outputH; ++h) {
+ long inputHStart = sequenceH[h];
+
+ for (w = 0; w < outputW; ++w) {
+ long inputWStart = sequenceW[w];
+
+ for (t = 0; t < outputT; ++t) {
+ long inputTStart = sequenceT[t];
+
+ real maxVal = -THInf;
+ long maxIndex = -1;
+
+ long h2, w2, t2;
+ for (h2 = inputHStart; h2 < inputHStart + poolSizeH; ++h2) {
+ for (w2 = inputWStart; w2 < inputWStart + poolSizeW; ++w2) {
+ for (t2 = inputTStart; t2 < inputTStart + poolSizeT; ++t2) {
+ THAssert(h2 >= 0 && h2 < inputH);
+ THAssert(w2 >= 0 && w2 < inputW);
+ THAssert(t2 >= 0 && t2 < inputT);
+
+ long planeIndex = h2 * inputW * inputT + w2 * inputT + t2;
+ real val = inputForPlane[planeIndex];
+ if (val > maxVal) {
+ maxVal = val;
+ maxIndex = planeIndex;
+ }
+ }
+ }
+ }
+
+ THAssert(maxVal != -THInf);
+ THAssert(maxIndex != -1);
+
+ outputForPlane[h * outputW * outputT + w * outputT + t] = maxVal;
+ /* +1 to lua index */
+ indicesForPlane[h * outputW * outputT + w * outputT + t] = maxIndex + TH_INDEX_BASE;
+ }
+ }
+ }
+
+ THFree(sequenceT);
+ THFree(sequenceW);
+ THFree(sequenceH);
+ }
+}
+
+void THNN_(VolumetricFractionalMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int outputT, int outputW, int outputH,
+ int poolSizeT, int poolSizeW, int poolSizeH,
+ THIndexTensor *indices,
+ THTensor *randomSamples) {
+
+ long numBatch = 1;
+ int planeDim = 0;
+ int heightDim = 1;
+ int widthDim = 2;
+ int timeDim = 3;
+
+ long numInputDims = THTensor_(nDimension)(input);
+ THNN_ARGCHECK(numInputDims == 4 || numInputDims == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+
+ if (numInputDims == 5) {
+ numBatch = THTensor_(size)(input, 0);
+ planeDim++;
+ heightDim++;
+ widthDim++;
+ timeDim++;
+ }
+
+ /* sizes */
+ long numPlanes = THTensor_(size)(input, planeDim);
+ long inputH = THTensor_(size)(input, heightDim);
+ long inputW = THTensor_(size)(input, widthDim);
+ long inputT = THTensor_(size)(input, timeDim);
+
+ THArgCheck(outputH + poolSizeH - 1 < inputH, 9,
+ "poolSizeH (%d) too large relative to input height (%ld)",
+ poolSizeH, inputH);
+ THArgCheck(outputW + poolSizeW - 1 < inputW, 8,
+ "poolSizeW (%d) too large relative to input width (%ld)",
+ poolSizeW, inputW);
+ THArgCheck(outputT + poolSizeT - 1 < inputT, 7,
+ "poolSizeT (%d) too large relative to input time (%ld)",
+ poolSizeT, inputT);
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ if (numInputDims == 4) {
+ /* resize output */
+ THTensor_(resize4d)(output, numPlanes, outputH, outputW, outputT);
+ /* indices will contain the locations for each output point */
+ THIndexTensor_(resize4d)(indices, numPlanes, outputH, outputW, outputT);
+
+ THNN_(VolumetricFractionalMaxPooling_updateOutput_frame)(
+ THTensor_(data)(input),
+ THTensor_(data)(output),
+ THIndexTensor_(data)(indices),
+ THTensor_(data)(randomSamples),
+ numPlanes, inputT, inputW, inputH,
+ outputT, outputW, outputH, poolSizeT, poolSizeW, poolSizeH);
+ } else {
+ THTensor_(resize5d)(output, numBatch, numPlanes, outputH, outputW, outputT);
+ /* indices will contain the locations for each output point */
+ THIndexTensor_(resize5d)(indices, numBatch, numPlanes, outputH, outputW, outputT);
+
+ long batch;
+#pragma omp parallel for private(batch)
+ for (batch = 0; batch < numBatch; ++batch) {
+ THNN_(VolumetricFractionalMaxPooling_updateOutput_frame)(
+ THTensor_(data)(input) + batch * numPlanes * inputH * inputW * inputT,
+ THTensor_(data)(output) + batch * numPlanes * outputH * outputW * outputT,
+ THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW * outputT,
+ THTensor_(data)(randomSamples) + batch * numPlanes * 3,
+ numPlanes, inputT, inputW, inputH,
+ outputT, outputW, outputH, poolSizeT, poolSizeW, poolSizeH);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+}
+
+static void THNN_(VolumetricFractionalMaxPooling_updateGradInput_frame)(
+ real* gradInput,
+ real* gradOutput,
+ THIndex_t* indices,
+ long numPlanes,
+ long inputT, long inputW, long inputH,
+ long outputT, long outputW, long outputH) {
+ long plane;
+#pragma omp parallel for private(plane)
+ for (plane = 0; plane < numPlanes; plane++) {
+ real* gradInputForPlane = gradInput + plane * inputT * inputW * inputH;
+ real* gradOutputForPlane = gradOutput + plane * outputT * outputW * outputH;
+ THIndex_t* indicesForPlane = indices + plane * outputT * outputW * outputH;
+
+ long h, w, t;
+ for (h = 0; h < outputH; ++h) {
+ for (w = 0; w < outputW; ++w) {
+ for (t = 0; t < outputT; ++t) {
+ long outputIndex = h * outputW * outputT + w * outputT + t;
+ long index = indicesForPlane[outputIndex] - TH_INDEX_BASE;
+ THAssert(index >= 0 && index < inputT * inputW * inputH);
+
+ gradInputForPlane[index] += gradOutputForPlane[outputIndex];
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricFractionalMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int outputT, int outputW, int outputH,
+ int poolSizeT, int poolSizeW, int poolSizeH,
+ THIndexTensor *indices) {
+
+ long numBatch = 1;
+ int planeDim = 0;
+ int heightDim = 1;
+ int widthDim = 2;
+ int timeDim = 3;
+
+ long numInputDims = THTensor_(nDimension)(input);
+ if (numInputDims == 5) {
+ numBatch = THTensor_(size)(input, 0);
+ planeDim = 1;
+ heightDim++;
+ widthDim++;
+ timeDim++;
+ }
+
+ /* sizes */
+ long numPlanes = THTensor_(size)(input, planeDim);
+ long inputH = THTensor_(size)(input, heightDim);
+ long inputW = THTensor_(size)(input, widthDim);
+ long inputT = THTensor_(size)(input, timeDim);
+
+ THArgCheck(outputT == THTensor_(size)(gradOutput, timeDim), 3,
+ "gradOutput time unexpected");
+ THArgCheck(outputW == THTensor_(size)(gradOutput, widthDim), 3,
+ "gradOutput width unexpected");
+ THArgCheck(outputH == THTensor_(size)(gradOutput, heightDim), 3,
+ "gradOutput height unexpected");
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ /* backprop */
+ if (numInputDims == 4) {
+ THNN_(VolumetricFractionalMaxPooling_updateGradInput_frame)(
+ THTensor_(data)(gradInput),
+ THTensor_(data)(gradOutput),
+ THIndexTensor_(data)(indices),
+ numPlanes, inputT, inputW, inputH, outputT, outputW, outputH);
+ } else {
+ long batch;
+#pragma omp parallel for private(batch)
+ for (batch = 0; batch < numBatch; ++batch) {
+ THNN_(VolumetricFractionalMaxPooling_updateGradInput_frame)(
+ THTensor_(data)(gradInput) + batch * numPlanes * inputH * inputW * inputT,
+ THTensor_(data)(gradOutput) + batch * numPlanes * outputH * outputW * outputT,
+ THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW * outputT,
+ numPlanes, inputT, inputW, inputH, outputT, outputW, outputH);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricFullConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricFullConvolution.c
new file mode 100644
index 000000000..c974fab50
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricFullConvolution.c
@@ -0,0 +1,541 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricFullConvolution.c"
+#else
+
+static void THNN_(vol2col)(
+ const real *data_vol, const int channels,
+ const int depth, const int height, const int width,
+ const int kT, const int kH, const int kW,
+ const int pT, const int pH, const int pW,
+ const int dT, const int dH, const int dW,
+ const int dilationT, const int dilationH, const int dilationW,
+ real *data_col)
+{
+ int c, t, h, w;
+ int depth_col = (depth + 2 * pT - (dilationT * (kT - 1) + 1)) / dT + 1;
+ int height_col = (height + 2 * pH - (dilationH * (kH - 1) + 1)) / dH + 1;
+ int width_col = (width + 2 * pW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ int channels_col = channels * kT * kH * kW;
+ for (c = 0; c < channels_col; ++c)
+ {
+ int w_offset = c % kW;
+ int h_offset = (c / kW) % kH;
+ int t_offset = (c / kW / kH) % kT;
+ int c_vol = c / kT / kH / kW;
+ for (t = 0; t < depth_col; ++t)
+ {
+ for (h = 0; h < height_col; ++h)
+ {
+ for (w = 0; w < width_col; ++w)
+ {
+ int t_pad = t * dT - pT + t_offset * dilationT;
+ int h_pad = h * dH - pH + h_offset * dilationH;
+ int w_pad = w * dW - pW + w_offset * dilationW;
+ if (t_pad >= 0 && t_pad < depth &&
+ h_pad >= 0 && h_pad < height &&
+ w_pad >= 0 && w_pad < width)
+ data_col[((c * depth_col + t) * height_col + h) * width_col + w] =
+ data_vol[((c_vol * depth + t_pad) * height + h_pad) * width + w_pad];
+ else
+ data_col[((c * depth_col + t) * height_col + h) * width_col + w] = 0;
+ }
+ }
+ }
+ }
+}
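+
+// vol2col unrolls every (kT x kH x kW) receptive field into one column of a
+// (channels*kT*kH*kW) x (depth_col*height_col*width_col) matrix: row c fixes a
+// (channel, kernel-tap) pair, column (t,h,w) fixes an output position, and
+// padding taps are written as zero. col2vol below is the adjoint operation:
+// it scatter-adds the column matrix back into the volume.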
+
+static void THNN_(col2vol)(
+ const real* data_col, const int channels,
+ const int depth, const int height, const int width,
+ const int kT, const int kH, const int kW,
+ const int pT, const int pH, const int pW,
+ const int dT, const int dH, const int dW,
+ const int dilationT, const int dilationH, const int dilationW,
+ real* data_vol)
+{
+ int c, t, h, w;
+ memset(data_vol, 0, sizeof(real) * depth * height * width * channels);
+ int depth_col = (depth + 2 * pT - (dilationT * (kT - 1) + 1)) / dT + 1;
+ int height_col = (height + 2 * pH - (dilationH * (kH - 1) + 1)) / dH + 1;
+ int width_col = (width + 2 * pW - (dilationW * (kW - 1) + 1)) / dW + 1;
+ int channels_col = channels * kT * kH * kW;
+ for (c = 0; c < channels_col; ++c)
+ {
+ int w_offset = c % kW;
+ int h_offset = (c / kW) % kH;
+ int t_offset = (c / kW / kH) % kT;
+ int c_vol = c / kT / kH / kW;
+ for (t = 0; t < depth_col; ++t)
+ {
+ for (h = 0; h < height_col; ++h)
+ {
+ for (w = 0; w < width_col; ++w)
+ {
+ int t_pad = t * dT - pT + t_offset * dilationT;
+ int h_pad = h * dH - pH + h_offset * dilationH;
+ int w_pad = w * dW - pW + w_offset * dilationW;
+ if (t_pad >= 0 && t_pad < depth &&
+ h_pad >= 0 && h_pad < height &&
+ w_pad >= 0 && w_pad < width)
+ data_vol[((c_vol * depth + t_pad) * height + h_pad) * width + w_pad] +=
+ data_col[((c * depth_col + t) * height_col + h) * width_col + w];
+ }
+ }
+ }
+ }
+}
+
+static inline void THNN_(VolumetricFullConvolution_shapeCheck)(
+ THTensor *input, THTensor *gradOutput,
+ THTensor *weight, THTensor *bias,
+ int dT, int dW, int dH, int pT, int pW, int pH,
+ int aT, int aW, int aH) {
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+ // number of input & output planes and kernel size are indirectly defined by the weight tensor
+ THNN_ARGCHECK(weight->nDimension == 5, 4, weight,
+ "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
+ "expected for weight, but got: %s");
+ THArgCheck(dT > 0 && dW > 0 && dH > 0, 11,
+ "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW);
+ THArgCheck(aT < dT && aW < dW && aH < dH, 15,
+ "output adjustment must be smaller than stride, but got "
+ "adjT: %d adjH: %d adjW: %d dT: %d dH: %d dW: %d",
+ aT, aH, aW, dT, dH, dW);
+
+ int ndim = input->nDimension;
+ const int nInputPlane = (int)weight->size[0];
+ const int nOutputPlane = (int)weight->size[1];
+ const int kT = (int)weight->size[2];
+ const int kH = (int)weight->size[3];
+ const int kW = (int)weight->size[4];
+
+ if (bias != NULL) {
+ THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[1]);
+ }
+
+ int dimf = 0;
+ int dimd = 1;
+ int dimh = 2;
+ int dimw = 3;
+
+ if (ndim == 5) {
+ dimf++;
+ dimd++;
+ dimh++;
+ dimw++;
+ }
+
+ const long inputWidth = input->size[dimw];
+ const long inputHeight = input->size[dimh];
+ const long inputDepth = input->size[dimd];
+ const long outputWidth = (inputWidth - 1) * dW - 2*pW + kW + aW;
+ const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH;
+ const long outputDepth = (inputDepth - 1) * dT - 2*pT + kT + aT;
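+
+ // A transposed (full) convolution inverts the forward size formula:
+ // out = (in - 1)*stride - 2*pad + kernel + adjustment, where the a*
+ // terms disambiguate input sizes that a strided forward convolution
+ // would map to the same output size. E.g. in=4, dT=2, pT=1, kT=3, aT=1
+ // gives (4-1)*2 - 2 + 3 + 1 = 8.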
+
+ if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1)
+ THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small",
+ nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth);
+
+ THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane);
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth);
+ }
+}
+
+void THNN_(VolumetricFullConvolution_updateOutput)(
+ THNNState *state,
+ THTensor *input, // 4D or 5D (batch) tensor
+ THTensor *output,
+ THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
+ THTensor *bias,
+ THTensor *finput, // internal columns buffer
+ THTensor *fgradInput, // internal ones buffer
+ int dT, int dW, int dH, // stride of the convolution
+ int pT, int pW, int pH, // padding
+ int aT, int aW, int aH) // extra output adjustment
+{
+ THTensor *columns = finput;
+ THTensor *ones = fgradInput;
+
+ THNN_(VolumetricFullConvolution_shapeCheck)(
+ input, NULL, weight, bias,
+ dT, dW, dH, pT, pW, pH, aT, aW, aH);
+
+ const int nInputPlane = (int)weight->size[0];
+ const int nOutputPlane = (int)weight->size[1];
+ const int kT = (int)weight->size[2];
+ const int kH = (int)weight->size[3];
+ const int kW = (int)weight->size[4];
+
+ input = THTensor_(newContiguous)(input);
+ weight = THTensor_(newContiguous)(weight);
+ bias = bias ? THTensor_(newContiguous)(bias) : bias;
+ int batch = 1;
+ if (input->nDimension == 4)
+ {
+ // Force batch
+ batch = 0;
+ THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ }
+
+ const long inputWidth = input->size[4];
+ const long inputHeight = input->size[3];
+ const long inputDepth = input->size[2];
+ const long outputWidth = (inputWidth - 1) * dW - 2*pW + kW + aW;
+ const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH;
+ const long outputDepth = (inputDepth - 1) * dT - 2*pT + kT + aT;
+
+ // Batch size + input planes
+ const long batchSize = input->size[0];
+
+ // Resize output
+ THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth);
+
+ // Resize temporary columns
+ THTensor_(resize2d)(columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
+ THTensor_(zero)(columns);
+
+ // Define a buffer of ones, for bias accumulation
+ // Note: this buffer can be shared with other modules; it only ever grows,
+ // and always contains ones.
+ if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth)
+ {
+ // Resize plane and fill with ones...
+ THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth);
+ THTensor_(fill)(ones, 1);
+ }
+
+ // Helpers
+ THTensor *input_n = THTensor_(new)();
+ THTensor *output_n = THTensor_(new)();
+
+ int elt;
+ // For each elt in batch, do:
+ for (elt = 0; elt < batchSize; ++elt)
+ {
+ // Matrix multiply per output:
+ THTensor_(select)(input_n, input, 0, elt);
+ THTensor_(select)(output_n, output, 0, elt);
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ const long m = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
+ const long n = columns->size[1];
+ const long k = weight->size[0];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ THBlas_(gemm)(
+ 'n', 't',
+ n, m, k,
+ 1,
+ THTensor_(data)(input_n), n,
+ THTensor_(data)(weight), m,
+ 0,
+ THTensor_(data)(columns), n
+ );
+
+ // Unpack columns back into input:
+ THNN_(col2vol)(
+ THTensor_(data)(columns),
+ nOutputPlane, outputDepth, outputHeight, outputWidth,
+ kT, kH, kW,
+ pT, pH, pW,
+ dT, dH, dW,
+ 1, 1, 1,
+ THTensor_(data)(output_n)
+ );
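+
+ // The forward pass of a full convolution mirrors the backward pass of a
+ // normal one: the GEMM above computes columns = weight^T * input_n (in
+ // row-major terms), and col2vol then scatter-adds the overlapping kernel
+ // contributions into output_n.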
+
+ // Do Bias after:
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ const long m_ = nOutputPlane;
+ const long n_ = outputDepth * outputHeight * outputWidth;
+ const long k_ = 1;
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ if (bias) {
+ THBlas_(gemm)(
+ 't', 'n',
+ n_, m_, k_,
+ 1,
+ THTensor_(data)(ones), k_,
+ THTensor_(data)(bias), k_,
+ 1,
+ THTensor_(data)(output_n), n_
+ );
+ }
+ }
+
+ // Free
+ THTensor_(free)(input_n);
+ THTensor_(free)(output_n);
+
+ // Resize output
+ if (batch == 0)
+ {
+ THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(weight);
+ if (bias) THTensor_(free)(bias);
+}
+
+void THNN_(VolumetricFullConvolution_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THTensor *weight,
+ THTensor *finput,
+ THTensor *fgradInput, // only used by cuda impl
+ int dT, int dW, int dH, // stride
+ int pT, int pW, int pH, // padding
+ int aT, int aW, int aH) // extra output adjustment
+{
+ THTensor *gradColumns = finput;
+
+ // number of input & output planes and kernel size are indirectly defined by the weight tensor
+ THNN_(VolumetricFullConvolution_shapeCheck)(
+ input, gradOutput, weight, NULL,
+ dT, dW, dH, pT, pW, pH, aT, aW, aH);
+
+ const int nInputPlane = (int)weight->size[0];
+ const int nOutputPlane = (int)weight->size[1];
+ const int kT = (int)weight->size[2];
+ const int kH = (int)weight->size[3];
+ const int kW = (int)weight->size[4];
+
+ input = THTensor_(newContiguous)(input);
+ weight = THTensor_(newContiguous)(weight);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ int batch = 1;
+ if (input->nDimension == 4)
+ {
+ // Force batch
+ batch = 0;
+ THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+ }
+
+ const long inputWidth = input->size[4];
+ const long inputHeight = input->size[3];
+ const long inputDepth = input->size[2];
+ const long outputWidth = (inputWidth - 1) * dW - 2*pW + kW + aW;
+ const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH;
+ const long outputDepth = (inputDepth - 1) * dT - 2*pT + kT + aT;
+
+ // Batch size + input planes
+ const long batchSize = input->size[0];
+
+ // Resize output
+ THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth);
+ THTensor_(zero)(gradInput);
+
+ // Resize temporary columns
+ THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
+
+ // Helpers
+ THTensor *gradInput_n = THTensor_(new)();
+ THTensor *gradOutput_n = THTensor_(new)();
+
+ int elt;
+ // For each elt in batch, do:
+ for (elt = 0; elt < batchSize; ++elt)
+ {
+ // Matrix multiply per sample:
+ THTensor_(select)(gradInput_n, gradInput, 0, elt);
+ THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ THNN_(vol2col)(
+ THTensor_(data)(gradOutput_n),
+ nOutputPlane, outputDepth, outputHeight, outputWidth,
+ kT, kH, kW,
+ pT, pH, pW,
+ dT, dH, dW,
+ 1, 1, 1,
+ THTensor_(data)(gradColumns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ const long m = weight->size[0];
+ const long n = gradColumns->size[1];
+ const long k = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4];
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ THBlas_(gemm)(
+ 'n', 'n',
+ n, m, k,
+ 1,
+ THTensor_(data)(gradColumns), n,
+ THTensor_(data)(weight), k,
+ 0,
+ THTensor_(data)(gradInput_n), n
+ );
+ }
+
+ // Free
+ THTensor_(free)(gradInput_n);
+ THTensor_(free)(gradOutput_n);
+
+ // Resize output
+ if (batch == 0)
+ {
+ THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
+ THTensor_(resize4d)(gradInput, nInputPlane, inputDepth, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+ THTensor_(free)(weight);
+}
+
+void THNN_(VolumetricFullConvolution_accGradParameters)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradWeight,
+ THTensor *gradBias,
+ THTensor *finput,
+ THTensor *fgradInput,
+ int dT, int dW, int dH, // stride
+ int pT, int pW, int pH, // padding
+ int aT, int aW, int aH, // extra output adjustment
+ accreal scale_)
+{
+ real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+ // number of input & output planes and kernel size are indirectly defined by the gradWeight tensor
+ THNN_(VolumetricFullConvolution_shapeCheck)(
+ input, gradOutput, gradWeight, gradBias,
+ dT, dW, dH, pT, pW, pH, aT, aW, aH);
+
+ int nInputPlane = (int)gradWeight->size[0];
+ int nOutputPlane = (int)gradWeight->size[1];
+ int kT = (int)gradWeight->size[2];
+ int kH = (int)gradWeight->size[3];
+ int kW = (int)gradWeight->size[4];
+
+ THTensor *columns = finput;
+ THTensor *ones = fgradInput;
+
+ input = THTensor_(newContiguous)(input);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous");
+ if (gradBias)
+ THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous");
+
+ int batch = 1;
+ if (input->nDimension == 4)
+ {
+ // Force batch
+ batch = 0;
+ THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]);
+ THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]);
+ }
+
+ const long inputWidth = input->size[4];
+ const long inputHeight = input->size[3];
+ const long inputDepth = input->size[2];
+ const long outputWidth = (inputWidth - 1) * dW - 2*pW + kW + aW;
+ const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH;
+ const long outputDepth = (inputDepth - 1) * dT - 2*pT + kT + aT;
+
+ // Batch size + input planes
+ const long batchSize = input->size[0];
+
+ // Define a buffer of ones, for bias accumulation
+ if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth)
+ {
+ // Resize plane and fill with ones...
+ THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth);
+ THTensor_(fill)(ones, 1);
+ }
+
+ // Resize temporary columns
+ THTensor_(resize2d)(columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth);
+
+ // Helpers
+ THTensor *input_n = THTensor_(new)();
+ THTensor *gradOutput_n = THTensor_(new)();
+
+ int elt;
+ // For each elt in batch, do:
+ for (elt = 0; elt < batchSize; ++elt)
+ {
+ // Matrix multiply per output:
+ THTensor_(select)(input_n, input, 0, elt);
+ THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+ // Extract columns:
+ THNN_(vol2col)(
+ THTensor_(data)(gradOutput_n), nOutputPlane,
+ outputDepth, outputHeight, outputWidth,
+ kT, kH, kW,
+ pT, pH, pW,
+ dT, dH, dW,
+ 1, 1, 1,
+ THTensor_(data)(columns)
+ );
+
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ const long n = columns->size[0]; // nOutputPlane * kt * kh * kw
+ const long m = input_n->size[0]; // nInputPlane
+ const long k = columns->size[1]; // inputHeight * inputWidth
+
+ // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+ THBlas_(gemm)(
+ 't', 'n',
+ n, m, k,
+ scale,
+ THTensor_(data)(columns), k,
+ THTensor_(data)(input_n), k,
+ 1,
+ THTensor_(data)(gradWeight), n
+ );
+
+ // Do Bias:
+ // M,N,K are dims of matrix A and B
+ // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+ const long m_ = nOutputPlane;
+ const long k_ = outputDepth * outputHeight * outputWidth;
+
+ // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
+ if (gradBias) {
+ THBlas_(gemv)(
+ 't',
+ k_, m_,
+ scale,
+ THTensor_(data)(gradOutput_n), k_,
+ THTensor_(data)(ones), 1,
+ 1,
+ THTensor_(data)(gradBias), 1
+ );
+ }
+ }
+
+ // Free
+ THTensor_(free)(input_n);
+ THTensor_(free)(gradOutput_n);
+
+ // Resize
+ if (batch == 0)
+ {
+ THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth);
+ THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth);
+ }
+
+ THTensor_(free)(input);
+ THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxPooling.c
new file mode 100644
index 000000000..a3601e0b6
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxPooling.c
@@ -0,0 +1,50 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricMaxPooling.c"
+#else
+
+void THNN_(VolumetricMaxPooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ bool ceilMode)
+{
+ THNN_(VolumetricDilatedMaxPooling_updateOutput)(
+ state, input, output, indices,
+ kT, kW, kH, dT, dW, dH,
+ pT, pW, pH, 1, 1, 1, ceilMode);
+}
+
+void THNN_(VolumetricMaxPooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int kT,
+ int kW,
+ int kH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH,
+ bool ceilMode)
+{
+ THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
+ state, input, gradOutput, gradInput, indices,
+ kT, kW, kH, dT, dW, dH,
+ pT, pW, pH, 1, 1, 1, ceilMode);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxUnpooling.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxUnpooling.c
new file mode 100644
index 000000000..d9d9e5951
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxUnpooling.c
@@ -0,0 +1,373 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricMaxUnpooling.c"
+#else
+
+static inline void THNN_(VolumetricMaxUnpooling_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THIndexTensor *indices,
+ int oT,
+ int oW,
+ int oH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+
+ THNN_CHECK_SHAPE_INDICES(input, indices);
+
+ THArgCheck(dT > 0 && dW > 0 && dH > 0, 10,
+ "stride should be greater than zero, but got dT: %d dH: %d dW: %d",
+ dT, dH, dW);
+
+ int dimw = 3;
+ int dimh = 2;
+ int dimt = 1;
+ int dimn = 0;
+
+ if (input->nDimension == 5)
+ {
+ dimt++;
+ dimw++;
+ dimh++;
+ dimn++;
+ }
+ int nslices = input->size[dimn];
+
+ if (gradOutput != NULL) {
+ if (oT != gradOutput->size[dimt] || oW != gradOutput->size[dimw] || oH != gradOutput->size[dimh])
+ {
+ THError(
+ "Inconsistent gradOutput size. oT= %d, oH= %d, oW= %d, gradOutput: %dx%dx%d",
+ oT, oH, oW, gradOutput->size[dimt], gradOutput->size[dimh], gradOutput->size[dimw]
+ );
+ }
+
+ THNN_CHECK_DIM_SIZE(gradOutput, input->nDimension, dimn, nslices);
+ }
+}
+
+static void THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
+ real *input_p,
+ real *output_p,
+ THIndex_t *ind_p,
+ int nslices,
+ int iT,
+ int iW,
+ int iH,
+ int oT,
+ int oW,
+ int oH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ int k;
+ int has_error = 0;
+ THIndex_t error_index;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ int ti, i, j, maxz, maxy, maxx;
+ for (ti = 0; ti < iT; ti++)
+ {
+ for (i = 0; i < iH; i++)
+ {
+ for (j = 0; j < iW; j++)
+ {
+ int start_t = ti * dT - pT;
+ int start_h = i * dH - pH;
+ int start_w = j * dW - pW;
+
+ real *input_p_k = input_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
+ THIndex_t *ind_p_k = ind_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
+
+ maxz = ((unsigned char*)(ind_p_k))[0]; /* retrieve position of max */
+ maxy = ((unsigned char*)(ind_p_k))[1];
+ maxx = ((unsigned char*)(ind_p_k))[2];
+
+ THIndex_t idx = k*oT*oW*oH + oH*oW*(start_t+maxz) + oW*(start_h+maxy) + (start_w+maxx);
+ if (start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0 || start_t+maxz>=oT
+ || start_h+maxy>=oH || start_w+maxx>=oW)
+ {
+#pragma omp critical
+ {
+ has_error = 1;
+ error_index = idx;
+ }
+ } else {
+ output_p[idx] = *input_p_k; /* update output */
+ }
+ }
+ }
+ }
+ }
+ if (has_error) {
+ THError(
+ "found an invalid max index %ld (output volumes are of size %dx%dx%d)",
+ error_index, oT, oH, oW
+ );
+ }
+}
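+
+/* Unpooling scatters each input value to the position its pooling switch
+ recorded: the per-byte (t,h,w) offsets packed into the index word are added
+ to the window start (ti*dT - pT, ...), so the output stays zero everywhere
+ except at the argmax locations of the preceding max pooling. */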
+
+void THNN_(VolumetricMaxUnpooling_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ THIndexTensor *indices,
+ int oT,
+ int oW,
+ int oH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ int dimw = 3;
+ int dimh = 2;
+ int dimt = 1;
+ int nbatch = 1;
+ int nslices;
+ int iT;
+ int iH;
+ int iW;
+ real *input_data;
+ real *output_data;
+ THIndex_t *indices_data;
+
+ THNN_(VolumetricMaxUnpooling_shapeCheck)(
+ state, input, NULL, indices,
+ oT, oW, oH, dT, dW, dH, pT, pW, pH);
+
+ if (input->nDimension == 5)
+ {
+ nbatch = input->size[0];
+ dimt++;
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimt-1];
+ iT = input->size[dimt];
+ iH = input->size[dimh];
+ iW = input->size[dimw];
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+ indices = THIndexTensor_(newContiguous)(indices);
+
+ /* resize output */
+ if (input->nDimension == 4)
+ {
+ THTensor_(resize4d)(output, nslices, oT, oH, oW);
+ THTensor_(zero)(output);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
+ input_data, output_data,
+ indices_data,
+ nslices,
+ iT, iW, iH,
+ oT, oW, oH,
+ dT, dW, dH, pT, pW, pH
+ );
+ }
+ else
+ {
+ int p;
+
+ THTensor_(resize5d)(output, nbatch, nslices, oT, oH, oW);
+ THTensor_(zero)(output);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+ indices_data = THIndexTensor_(data)(indices);
+
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(VolumetricMaxUnpooling_updateOutput_frame)(
+ input_data+p*nslices*iT*iW*iH,
+ output_data+p*nslices*oT*oW*oH,
+ indices_data+p*nslices*iT*iW*iH,
+ nslices,
+ iT, iW, iH,
+ oT, oW, oH,
+ dT, dW, dH,
+ pT, pW, pH
+ );
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+ THIndexTensor_(free)(indices);
+}
+
+static void THNN_(VolumetricMaxUnpooling_updateGradInput_frame)(
+ real *gradInput_p,
+ real *gradOutput_p,
+ THIndex_t *ind_p,
+ int nslices,
+ int iT,
+ int iW,
+ int iH,
+ int oT,
+ int oW,
+ int oH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ int k;
+#pragma omp parallel for private(k)
+ for (k = 0; k < nslices; k++)
+ {
+ int ti, i, j, maxz, maxy, maxx;
+ for (ti = 0; ti < iT; ti++)
+ {
+ for (i = 0; i < iH; i++)
+ {
+ for (j = 0; j < iW; j++)
+ {
+ int start_t = ti * dT - pT;
+ int start_h = i * dH - pH;
+ int start_w = j * dW - pW;
+
+ real *gradInput_p_k = gradInput_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
+ THIndex_t *ind_p_k = ind_p + k*iT*iW*iH + ti*iW*iH + i*iW + j;
+
+ maxz = ((unsigned char*)(ind_p_k))[0]; /* retrieve position of max */
+ maxy = ((unsigned char*)(ind_p_k))[1];
+ maxx = ((unsigned char*)(ind_p_k))[2];
+
+ if (start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0
+ || start_t+maxz>=oT || start_h+maxy>=oH || start_w+maxx>=oW)
+ {
+ THError(
+ "invalid max index z= %d, y= %d, x= %d, oT= %d, oW= %d, oH= %d",
+ start_t+maxz, start_h+maxy, start_w+maxx, oT, oW, oH
+ );
+ }
+ *gradInput_p_k = gradOutput_p[k*oT*oW*oH + oH*oW*(start_t+maxz)
+ + oW*(start_h+maxy) + (start_w+maxx)]; /* update gradient */
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricMaxUnpooling_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ THIndexTensor *indices,
+ int oT,
+ int oW,
+ int oH,
+ int dT,
+ int dW,
+ int dH,
+ int pT,
+ int pW,
+ int pH)
+{
+ int dimw = 3;
+ int dimh = 2;
+ int dimt = 1;
+ int nbatch = 1;
+ int nslices;
+ int iT;
+ int iH;
+ int iW;
+ real *gradInput_data;
+ real *gradOutput_data;
+ THIndex_t *indices_data;
+
+ THNN_(VolumetricMaxUnpooling_shapeCheck)(
+ state, input, gradOutput, indices,
+ oT, oW, oH, dT, dW, dH, pT, pW, pH);
+
+ // TODO: check gradOutput shape
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ indices = THIndexTensor_(newContiguous)(indices);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ if (input->nDimension == 5)
+ {
+ nbatch = input->size[0];
+ dimt++;
+ dimw++;
+ dimh++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimt-1];
+ iT = input->size[dimt];
+ iH = input->size[dimh];
+ iW = input->size[dimw];
+
+ /* get raw pointers */
+ gradInput_data = THTensor_(data)(gradInput);
+ gradOutput_data = THTensor_(data)(gradOutput);
+ indices_data = THIndexTensor_(data)(indices);
+
+ /* backprop */
+ if (input->nDimension == 4)
+ {
+ THNN_(VolumetricMaxUnpooling_updateGradInput_frame)(
+ gradInput_data, gradOutput_data,
+ indices_data,
+ nslices,
+ iT, iW, iH,
+ oT, oW, oH,
+ dT, dW, dH,
+ pT, pW, pH
+ );
+ }
+ else
+ {
+ int p;
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(VolumetricMaxUnpooling_updateGradInput_frame)(
+ gradInput_data+p*nslices*iT*iW*iH,
+ gradOutput_data+p*nslices*oT*oW*oH,
+ indices_data+p*nslices*iT*iW*iH,
+ nslices,
+ iT, iW, iH,
+ oT, oW, oH,
+ dT, dW, dH,
+ pT, pW, pH
+ );
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+ THIndexTensor_(free)(indices);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricReplicationPadding.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricReplicationPadding.c
new file mode 100644
index 000000000..4d8993ec2
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricReplicationPadding.c
@@ -0,0 +1,357 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricReplicationPadding.c"
+#else
+
+static inline void THNN_(VolumetricReplicationPadding_shapeCheck)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback) {
+ int dimw = 3;
+ int dimh = 2;
+ int dimd = 1;
+ int dimslices = 0;
+ long nslices;
+ long idepth;
+ long iheight;
+ long iwidth;
+ long odepth;
+ long oheight;
+ long owidth;
+
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D (batch mode) tensor expected for input, but got: %s");
+
+ if (input->nDimension == 5)
+ {
+ dimw++;
+ dimh++;
+ dimd++;
+ dimslices++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimslices];
+ idepth = input->size[dimd];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ odepth = idepth + pfront + pback;
+ oheight = iheight + ptop + pbottom;
+ owidth = iwidth + pleft + pright;
+
+ THArgCheck(owidth >= 1 && oheight >= 1 && odepth >= 1, 2,
+ "input (D: %d, H: %d, W: %d) is too small."
+ " Calculated output D: %d, H: %d, W: %d",
+ idepth, iheight, iwidth, odepth, oheight, owidth);
+
+ if (gradOutput != NULL) {
+ THArgCheck(nslices == THTensor_(size)(gradOutput, dimslices), 3,
+ "gradOutput nslices unexpected. Expected: %d, Got: %d",
+ nslices, THTensor_(size)(gradOutput, dimslices));
+ THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3,
+ "gradOutput width unexpected. Expected: %d, Got: %d",
+ owidth, THTensor_(size)(gradOutput, dimw));
+ THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3,
+ "gradOutput height unexpected. Expected: %d, Got: %d",
+ oheight, THTensor_(size)(gradOutput, dimh));
+ THArgCheck(odepth == THTensor_(size)(gradOutput, dimd), 3,
+ "gradOutput depth unexpected. Expected: %d, Got: %d",
+ odepth, THTensor_(size)(gradOutput, dimd));
+ }
+}
+
+static void THNN_(VolumetricReplicationPadding_updateOutput_frame)(
+ real *input_p, real *output_p,
+ long nslices,
+ long iwidth, long iheight, long idepth,
+ long owidth, long oheight, long odepth,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback)
+{
+ int iStartX = fmax(0, -pleft);
+ int iStartY = fmax(0, -ptop);
+ int iStartZ = fmax(0, -pfront);
+ int oStartX = fmax(0, pleft);
+ int oStartY = fmax(0, ptop);
+ int oStartZ = fmax(0, pfront);
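+ /* the fmax(0, ...) clamps let iStart/oStart absorb negative padding
+ (cropping): with pleft < 0 the copy starts -pleft samples into the
+ input instead of writing a replicated border */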
+
+ long k, ip_x, ip_y, ip_z;
+#pragma omp parallel for private(k, ip_x, ip_y, ip_z)
+ for (k = 0; k < nslices; k++) {
+ long i, j, z;
+ for (z = 0; z < odepth; z++) {
+ for (i = 0; i < oheight; i++) {
+ for (j = 0; j < owidth; j++) {
+ if (j < pleft) {
+ ip_x = pleft;
+ } else if (j >= pleft && j < iwidth + pleft) {
+ ip_x = j;
+ } else {
+ ip_x = iwidth + pleft - 1;
+ }
+ ip_x = ip_x - oStartX + iStartX;
+
+ if (i < ptop) {
+ ip_y = ptop;
+ } else if (i >= ptop && i < iheight + ptop) {
+ ip_y = i;
+ } else {
+ ip_y = iheight + ptop - 1;
+ }
+ ip_y = ip_y - oStartY + iStartY;
+
+ if (z < pfront) {
+ ip_z = pfront;
+ } else if (z >= pfront && z < idepth + pfront) {
+ ip_z = z;
+ } else {
+ ip_z = idepth + pfront - 1;
+ }
+ ip_z = ip_z - oStartZ + iStartZ;
+
+ real *dest_p = output_p + k * owidth * oheight * odepth +
+ z * owidth * oheight + i * owidth + j;
+ real *src_p = input_p + k * iwidth * iheight * idepth +
+ ip_z * iwidth * iheight + ip_y * iwidth + ip_x;
+ *dest_p = *src_p;
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricReplicationPadding_updateOutput)(THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback)
+{
+ int dimw = 3;
+ int dimh = 2;
+ int dimd = 1;
+ int dimslices = 0;
+ long nbatch = 1;
+ long nslices;
+ long idepth;
+ long iheight;
+ long iwidth;
+ long odepth;
+ long oheight;
+ long owidth;
+ real *input_data;
+ real *output_data;
+
+ THNN_(VolumetricReplicationPadding_shapeCheck)(
+ state, input, NULL, pleft, pright,
+ ptop, pbottom, pfront, pback);
+
+ if (input->nDimension == 5)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ dimd++;
+ dimslices++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimslices];
+ idepth = input->size[dimd];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ odepth = idepth + pfront + pback;
+ oheight = iheight + ptop + pbottom;
+ owidth = iwidth + pleft + pright;
+
+ /* get contiguous input */
+ input = THTensor_(newContiguous)(input);
+
+ /* resize output */
+ if (input->nDimension == 4)
+ {
+ THTensor_(resize4d)(output, nslices, odepth, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+ THNN_(VolumetricReplicationPadding_updateOutput_frame)(
+ input_data, output_data, nslices, iwidth, iheight, idepth,
+ owidth, oheight, odepth, pleft, pright, ptop, pbottom, pfront,
+ pback);
+ }
+ else
+ {
+ long p;
+
+ THTensor_(resize5d)(output, nbatch, nslices, odepth, oheight, owidth);
+
+ input_data = THTensor_(data)(input);
+ output_data = THTensor_(data)(output);
+
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++)
+ {
+ THNN_(VolumetricReplicationPadding_updateOutput_frame)(
+ input_data + p * nslices * iwidth * iheight * idepth,
+ output_data + p * nslices * owidth * oheight * odepth,
+ nslices,
+ iwidth, iheight, idepth,
+ owidth, oheight, odepth,
+ pleft, pright,
+ ptop, pbottom,
+ pfront, pback);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(input);
+}
+
+static void THNN_(VolumetricReplicationPadding_updateGradInput_frame)(
+ real *ginput_p, real *goutput_p,
+ long nslices,
+ long iwidth, long iheight, long idepth,
+ long owidth, long oheight, long odepth,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback)
+{
+ int iStartX = fmax(0, -pleft);
+ int iStartY = fmax(0, -ptop);
+ int iStartZ = fmax(0, -pfront);
+ int oStartX = fmax(0, pleft);
+ int oStartY = fmax(0, ptop);
+ int oStartZ = fmax(0, pfront);
+
+ long k, ip_x, ip_y, ip_z;
+#pragma omp parallel for private(k, ip_x, ip_y, ip_z)
+ for (k = 0; k < nslices; k++) {
+ long i, j, z;
+ for (z = 0; z < odepth; z++) {
+ for (i = 0; i < oheight; i++) {
+ for (j = 0; j < owidth; j++) {
+ if (j < pleft) {
+ ip_x = pleft;
+ } else if (j >= pleft && j < iwidth + pleft) {
+ ip_x = j;
+ } else {
+ ip_x = iwidth + pleft - 1;
+ }
+ ip_x = ip_x - oStartX + iStartX;
+
+ if (i < ptop) {
+ ip_y = ptop;
+ } else if (i >= ptop && i < iheight + ptop) {
+ ip_y = i;
+ } else {
+ ip_y = iheight + ptop - 1;
+ }
+ ip_y = ip_y - oStartY + iStartY;
+
+ if (z < pfront) {
+ ip_z = pfront;
+ } else if (z >= pfront && z < idepth + pfront) {
+ ip_z = z;
+ } else {
+ ip_z = idepth + pfront - 1;
+ }
+ ip_z = ip_z - oStartZ + iStartZ;
+
+ real *src_p = goutput_p + k * owidth * oheight * odepth +
+ z * owidth * oheight + i * owidth + j;
+ real *dest_p = ginput_p + k * iwidth * iheight * idepth +
+ ip_z * iwidth * iheight + ip_y * iwidth + ip_x;
+ *dest_p += *src_p;
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricReplicationPadding_updateGradInput)(THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int pleft, int pright,
+ int ptop, int pbottom,
+ int pfront, int pback)
+{
+ int dimw = 3;
+ int dimh = 2;
+ int dimd = 1;
+ int dimslices = 0;
+ long nbatch = 1;
+ long nslices;
+ long idepth;
+ long iheight;
+ long iwidth;
+ long odepth;
+ long oheight;
+ long owidth;
+
+ if (input->nDimension == 5)
+ {
+ nbatch = input->size[0];
+ dimw++;
+ dimh++;
+ dimd++;
+ dimslices++;
+ }
+
+ /* sizes */
+ nslices = input->size[dimslices];
+ idepth = input->size[dimd];
+ iheight = input->size[dimh];
+ iwidth = input->size[dimw];
+ odepth = idepth + pfront + pback;
+ oheight = iheight + ptop + pbottom;
+ owidth = iwidth + pleft + pright;
+
+ THNN_(VolumetricReplicationPadding_shapeCheck)(
+ state, input, NULL, pleft, pright,
+ ptop, pbottom, pfront, pback);
+
+ /* get contiguous gradOutput */
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+
+ /* resize */
+ THTensor_(resizeAs)(gradInput, input);
+ THTensor_(zero)(gradInput);
+
+ /* backprop */
+ if (input->nDimension == 4) {
+ THNN_(VolumetricReplicationPadding_updateGradInput_frame)(
+ THTensor_(data)(gradInput),
+ THTensor_(data)(gradOutput),
+ nslices,
+ iwidth, iheight, idepth,
+ owidth, oheight, odepth,
+ pleft, pright,
+ ptop, pbottom,
+ pfront, pback);
+ } else {
+ long p;
+#pragma omp parallel for private(p)
+ for (p = 0; p < nbatch; p++) {
+ THNN_(VolumetricReplicationPadding_updateGradInput_frame)(
+ THTensor_(data)(gradInput) + p * nslices * idepth * iheight * iwidth,
+ THTensor_(data)(gradOutput) + p * nslices * odepth * oheight * owidth,
+ nslices,
+ iwidth, iheight, idepth,
+ owidth, oheight, odepth,
+ pleft, pright,
+ ptop, pbottom,
+ pfront, pback);
+ }
+ }
+
+ /* cleanup */
+ THTensor_(free)(gradOutput);
+}
+
+#endif
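
Both the forward and backward kernels above share one clamp-to-edge mapping from
an output coordinate to the input coordinate it replicates. Stripped of the
tensor plumbing, the per-axis rule reduces to the following sketch (hypothetical
helper; assumes non-negative padding, since negative padding is absorbed by the
iStart/oStart offsets in the real code):

#include <stdio.h>

/* Map output index o along one axis to the input index it replicates.
 * isize : input extent along the axis
 * pad_lo: padding added before the axis (pleft/ptop/pfront above) */
static long replicate_index(long o, long isize, long pad_lo)
{
  if (o < pad_lo)          return 0;         /* clamp to the first element */
  if (o >= isize + pad_lo) return isize - 1; /* clamp to the last element */
  return o - pad_lo;                         /* interior: plain shift */
}

int main(void)
{
  /* input of width 4 padded by 2 on each side -> output width 8:
   * prints 0 0 0 1 2 3 3 3 */
  for (long o = 0; o < 8; o++)
    printf("%ld ", replicate_index(o, 4, 2));
  printf("\n");
  return 0;
}
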
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingNearest.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingNearest.c
new file mode 100644
index 000000000..9068fb58d
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingNearest.c
@@ -0,0 +1,226 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricUpSamplingNearest.c"
+#else
+
+static inline void THNN_(VolumetricUpSamplingNearest_shapeCheck)
+ (THTensor *input, THTensor *gradOutput,
+ int scale_factor) {
+ THArgCheck(input != NULL, 2, "4D or 5D input tensor expected but got NULL");
+ THArgCheck(scale_factor > 1, 4,
+ "scale_factor must be greater than 1, but got: %d", scale_factor);
+ THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+ "4D or 5D input tensor expected but got: %s");
+ if (input->nDimension == 4) {
+ int nChannels = THTensor_(size)(input, 0);
+ int inputDepth = THTensor_(size)(input, 1);
+ int inputHeight = THTensor_(size)(input, 2);
+ int inputWidth = THTensor_(size)(input, 3);
+ int outputDepth = inputDepth * scale_factor;
+ int outputHeight = inputHeight * scale_factor;
+ int outputWidth = inputWidth * scale_factor;
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nChannels);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, outputDepth);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, outputWidth);
+ }
+ } else {
+ int nBatch = THTensor_(size)(input, 0);
+ int nChannels = THTensor_(size)(input, 1);
+ int inputDepth = THTensor_(size)(input, 2);
+ int inputHeight = THTensor_(size)(input, 3);
+ int inputWidth = THTensor_(size)(input, 4);
+ int outputDepth = inputDepth * scale_factor;
+ int outputHeight = inputHeight * scale_factor;
+ int outputWidth = inputWidth * scale_factor;
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 0, nBatch);
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 1, nChannels);
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 2, outputDepth);
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 3, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 4, outputWidth);
+ }
+ }
+}
+
+void THNN_(VolumetricUpSamplingNearest_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int scale_factor)
+{
+ THNN_(VolumetricUpSamplingNearest_shapeCheck)(input, NULL, scale_factor);
+ int inputDepth = THTensor_(size)(input, input->nDimension-3);
+ int inputHeight = THTensor_(size)(input, input->nDimension-2);
+ int inputWidth = THTensor_(size)(input, input->nDimension-1);
+ int outputDepth = inputDepth * scale_factor;
+ int outputHeight = inputHeight * scale_factor;
+ int outputWidth = inputWidth * scale_factor;
+
+ if (input->nDimension == 4) {
+ THTensor_(resize4d)(output,
+ THTensor_(size)(input, 0),
+ outputDepth, outputHeight, outputWidth);
+ } else {
+ THTensor_(resize5d)(output,
+ THTensor_(size)(input, 0),
+ THTensor_(size)(input, 1),
+ outputDepth, outputHeight, outputWidth);
+ }
+
+ int dT = scale_factor;
+ int dW = scale_factor;
+ int dH = scale_factor;
+ int xDim = input->nDimension-3;
+ int yDim = input->nDimension-2;
+ int zDim = input->nDimension-1;
+
+ // dims
+ int idim = input->nDimension;
+ int osz0 = output->size[0];
+ int osz1 = output->size[1];
+ int osz2 = output->size[2];
+ int osz3 = output->size[3];
+ int osz4 = 1;
+ if (idim > 4) {
+ osz4 = output->size[4];
+ }
+
+ // get strides
+ long *is = input->stride;
+ long *os = output->stride;
+
+ // get raw pointers
+ real *pin = THTensor_(data)(input);
+ real *pout = THTensor_(data)(output);
+
+ // perform the upsampling
+ int i0, i1, i2, i3, i4, isrc, idst;
+ int iout[5]; // Output indices
+ int iin[5]; // Input indices
+
+ for (i0 = 0; i0 < osz0; i0++) {
+ iout[0] = i0;
+ iin[0] = i0;
+ for (i1 = 0; i1 < osz1; i1++) {
+ iout[1] = i1;
+ iin[1] = i1;
+ for (i2 = 0; i2 < osz2; i2++) {
+ iout[2] = i2;
+ iin[2] = i2;
+ for (i3 = 0; i3 < osz3; i3++) {
+ iout[3] = i3;
+ iin[3] = i3;
+ for (i4 = 0; i4 < osz4; i4++) {
+ iout[4] = i4;
+ iin[4] = i4;
+
+ // set the indices for the upsampled dimensions
+ iin[xDim] = iout[xDim] / dW;
+ iin[yDim] = iout[yDim] / dH;
+ iin[zDim] = iout[zDim] / dT;
+
+ idst = i0*os[0] + i1*os[1] + i2*os[2] + i3*os[3];
+ isrc = iin[0]*is[0] + iin[1]*is[1] + iin[2]*is[2] + iin[3]*is[3];
+ if (idim > 4) {
+ idst += i4*os[4];
+ isrc += iin[4]*is[4];
+ }
+
+ pout[idst] = pin[isrc];
+ }
+ }
+ }
+ }
+ }
+}
+
+void THNN_(VolumetricUpSamplingNearest_updateGradInput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int scale_factor)
+{
+ THNN_(VolumetricUpSamplingNearest_shapeCheck)(input, gradOutput, scale_factor);
+ THTensor_(resizeAs)(gradInput, input);
+
+ int dW = scale_factor;
+ int dH = scale_factor;
+ int dT = scale_factor;
+ int xDim = gradInput->nDimension-3;
+ int yDim = gradInput->nDimension-2;
+ int zDim = gradInput->nDimension-1;
+
+ // dims
+ int idim = gradInput->nDimension; // Guaranteed to be 4 or 5 by the shape check
+ int isz0 = gradInput->size[0];
+ int isz1 = gradInput->size[1];
+ int isz2 = gradInput->size[2];
+ int isz3 = gradInput->size[3];
+ int isz4 = 1;
+ if (idim > 4) {
+ isz4 = gradInput->size[4];
+ }
+
+ // get strides
+ long *is = gradInput->stride;
+ long *os = gradOutput->stride;
+
+ // get raw pointers
+ real *pin = THTensor_(data)(gradInput);
+ real *pout = THTensor_(data)(gradOutput);
+
+ // perform the upsampling
+ int i0, i1, i2, i3, i4, isrc, idst, x, y, z;
+ int iin[5]; // Input indices
+ int iout[5]; // Output indices
+
+ THTensor_(zero)(gradInput);
+
+ for (i0 = 0; i0 < isz0; i0++) {
+ iin[0] = i0;
+ iout[0] = i0;
+ for (i1 = 0; i1 < isz1; i1++) {
+ iin[1] = i1;
+ iout[1] = i1;
+ for (i2 = 0; i2 < isz2; i2++) {
+ iin[2] = i2;
+ iout[2] = i2;
+ for (i3 = 0; i3 < isz3; i3++) {
+ iin[3] = i3;
+ iout[3] = i3;
+
+ for (i4 = 0; i4 < isz4; i4++) {
+ iin[4] = i4;
+ iout[4] = i4;
+
+ idst = i0*is[0] + i1*is[1] + i2*is[2] + i3*is[3];
+ if (idim > 4) {
+ idst += i4*is[4];
+ }
+
+ // Now accumulate the gradients from gradOutput
+ for (z = 0; z < dT; z++) {
+ for (y = 0; y < dH; y++) {
+ for (x = 0; x < dW; x++) {
+ iout[xDim] = dW * iin[xDim] + x;
+ iout[yDim] = dH * iin[yDim] + y;
+ iout[zDim] = dT * iin[zDim] + z;
+ isrc = iout[0]*os[0] + iout[1]*os[1] + iout[2]*os[2] + iout[3]*os[3];
+ if (idim > 4) {
+ isrc += iout[4]*os[4];
+ }
+ pin[idst] += pout[isrc];
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+#endif
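
Both directions of the nearest-neighbour kernel above reduce to integer division
by the scale factor: output position o reads input position o / scale, and the
backward pass sums the scale outputs per axis that map to each input position.
A one-axis sketch under those assumptions (hypothetical function names):

#include <stdio.h>

/* Forward: out[o] = in[o / scale] along one axis. */
static void upsample_nearest_1d(const float *in, float *out, int isize, int scale)
{
  for (int o = 0; o < isize * scale; o++)
    out[o] = in[o / scale];
}

/* Backward: each input position accumulates the scale outputs mapped to it. */
static void upsample_nearest_grad_1d(float *gin, const float *gout, int isize, int scale)
{
  for (int i = 0; i < isize; i++) {
    gin[i] = 0.f;
    for (int s = 0; s < scale; s++)
      gin[i] += gout[i * scale + s];
  }
}

int main(void)
{
  const float in[2] = { 1.f, 2.f };
  float out[4], gin[2];
  upsample_nearest_1d(in, out, 2, 2);       /* out = 1 1 2 2 */
  upsample_nearest_grad_1d(gin, out, 2, 2); /* gin = 2 4 */
  printf("%g %g %g %g | %g %g\n", out[0], out[1], out[2], out[3], gin[0], gin[1]);
  return 0;
}
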
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingTrilinear.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingTrilinear.c
new file mode 100644
index 000000000..f2b04dba9
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingTrilinear.c
@@ -0,0 +1,213 @@
+// Adapted from interp.cpp from Caffe util by Pauline Luc
+// Originally developed by George Papandreou
+
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricUpSamplingTrilinear.c"
+#else
+
+static inline void THNN_(VolumetricUpSamplingTrilinear_shapeCheck)
+ (THTensor *input, THTensor *gradOutput,
+ int nBatch, int nChannels,
+ int inputDepth, int inputHeight, int inputWidth,
+ int outputDepth, int outputHeight, int outputWidth) {
+ THArgCheck(inputDepth > 0 && inputHeight > 0 && inputWidth > 0
+ && outputDepth > 0 && outputHeight > 0 && outputWidth > 0, 2,
+ "input and output sizes should be greater than 0,"
+ " but got input (D: %d, H: %d, W: %d) output (D: %d, H: %d, W: %d)",
+ inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth);
+ if (input != NULL) {
+ THNN_ARGCHECK(input->nDimension == 5, 2, input,
+ "5D input tensor expected but got: %s");
+ }
+
+ if (gradOutput != NULL) {
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 0, nBatch);
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 1, nChannels);
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 2, outputDepth);
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 3, outputHeight);
+ THNN_CHECK_DIM_SIZE(gradOutput, 5, 4, outputWidth);
+ }
+}
+
+void THNN_(VolumetricUpSamplingTrilinear_updateOutput)(
+ THNNState *state,
+ THTensor *input,
+ THTensor *output,
+ int outputDepth,
+ int outputHeight,
+ int outputWidth){
+
+ int nbatch = THTensor_(size)(input, 0);
+ int channels = THTensor_(size)(input, 1);
+ int inputDepth = THTensor_(size)(input, 2);
+ int inputHeight = THTensor_(size)(input, 3);
+ int inputWidth = THTensor_(size)(input, 4);
+
+ THNN_(VolumetricUpSamplingTrilinear_shapeCheck)
+ (input, NULL,
+ nbatch, channels,
+ inputDepth, inputHeight, inputWidth,
+ outputDepth, outputHeight, outputWidth);
+
+ input = THTensor_(newContiguous)(input);
+ THTensor_(resize5d)(output,
+ THTensor_(size)(input, 0),
+ THTensor_(size)(input, 1),
+ outputDepth, outputHeight, outputWidth);
+ THTensor_(zero)(output);
+ real *idata = THTensor_(data)(input);
+ real *odata = THTensor_(data)(output);
+ channels = nbatch * channels;
+ THAssert(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 &&
+ outputDepth > 0 && outputHeight > 0 && outputWidth > 0);
+ // special case: just copy
+ if (inputDepth == outputDepth && inputHeight == outputHeight && inputWidth == outputWidth) {
+ for (int t2 = 0; t2 < outputDepth; ++t2) {
+ const int t1 = t2;
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
+ const int h1 = h2;
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
+ const int w1 = w2;
+ const real* pos1 = &idata[t1 * inputHeight * inputWidth + h1 * inputWidth + w1];
+ real* pos2 = &odata[t2 * outputHeight * outputWidth + h2 * outputWidth + w2];
+ for (int c = 0; c < channels; ++c) {
+ pos2[0] = pos1[0];
+ pos1 += inputWidth * inputHeight * inputDepth;
+ pos2 += outputWidth * outputHeight * outputDepth;
+ }
+ }
+ }
+ }
+ return;
+ }
+ const float rdepth = (outputDepth > 1) ? (float)(inputDepth - 1)/(outputDepth - 1) : 0.f;
+ const float rheight = (outputHeight > 1) ? (float)(inputHeight - 1)/(outputHeight - 1) : 0.f;
+ const float rwidth = (outputWidth > 1) ? (float)(inputWidth - 1) / (outputWidth - 1) : 0.f;
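+ /* align-corners ratios: output index o maps to the real-valued input
+ coordinate r * o, so the first and last samples of input and output
+ coincide; the integer part picks the lower neighbour and the
+ fractional part becomes the lambda blend weights below */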
+ for (int t2 = 0; t2 < outputDepth; ++t2) {
+ const float t1r = rdepth * t2;
+ const int t1 = t1r;
+ const int t1p = (t1 < inputDepth - 1) ? 1 : 0;
+ const real t1lambda = t1r - t1;
+ const real t0lambda = (real)1. - t1lambda;
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
+ const float h1r = rheight * h2;
+ const int h1 = h1r;
+ const int h1p = (h1 < inputHeight - 1) ? 1 : 0;
+ const real h1lambda = h1r - h1;
+ const real h0lambda = (real)1. - h1lambda;
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
+ const float w1r = rwidth * w2;
+ const int w1 = w1r;
+ const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
+ const real w1lambda = w1r - w1;
+ const real w0lambda = (real)1. - w1lambda;
+ const real* pos1 = &idata[t1 * inputHeight * inputWidth + h1 * inputWidth + w1];
+ real* pos2 = &odata[t2 * outputHeight * outputWidth + h2 * outputWidth + w2];
+ for (int c = 0; c < channels; ++c) {
+ pos2[0] = t0lambda * (h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p])
+ + h1lambda * (w0lambda * pos1[h1p * inputWidth]
+ + w1lambda * pos1[h1p * inputWidth + w1p]))
+ + t1lambda * (h0lambda * (w0lambda * pos1[t1p * inputHeight * inputWidth]
+ + w1lambda * pos1[t1p * inputHeight * inputWidth
+ + w1p])
+ + h1lambda * (w0lambda * pos1[t1p * inputHeight * inputWidth
+ + h1p * inputWidth]
+ + w1lambda * pos1[t1p * inputHeight * inputWidth
+ + h1p * inputWidth + w1p]));
+ pos1 += inputWidth * inputHeight * inputDepth;
+ pos2 += outputWidth * outputHeight * outputDepth;
+ }
+ }
+ }
+ }
+ THTensor_(free)(input);
+}
+
+void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)(
+ THNNState *state,
+ THTensor *gradOutput,
+ THTensor *gradInput,
+ int nbatch,
+ int channels,
+ int inputDepth,
+ int inputHeight,
+ int inputWidth,
+ int outputDepth,
+ int outputHeight,
+ int outputWidth){
+
+ THNN_(VolumetricUpSamplingTrilinear_shapeCheck)
+ (NULL, gradOutput,
+ nbatch, channels,
+ inputDepth, inputHeight, inputWidth,
+ outputDepth, outputHeight, outputWidth);
+
+ THTensor_(resize5d)(gradInput, nbatch, channels, inputDepth, inputHeight, inputWidth);
+ THTensor_(zero)(gradInput);
+ gradOutput = THTensor_(newContiguous)(gradOutput);
+ real *data1 = THTensor_(data)(gradInput);
+ real *data2 = THTensor_(data)(gradOutput);
+ channels = nbatch * channels;
+
+ // special case: same-size matching grids
+ if (inputDepth == outputDepth && inputHeight == outputHeight && inputWidth == outputWidth) {
+ for (int t2 = 0; t2 < outputDepth; ++t2) {
+ const int t1 = t2;
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
+ const int h1 = h2;
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
+ const int w1 = w2;
+ real* pos1 = &data1[t1 * inputHeight * inputWidth + h1 * inputWidth + w1];
+ const real* pos2 = &data2[t2 * outputHeight * outputWidth + h2 * outputWidth + w2];
+ for (int c = 0; c < channels; ++c) {
+ pos1[0] += pos2[0];
+ pos1 += inputWidth * inputHeight * inputDepth;
+ pos2 += outputWidth * outputHeight * outputDepth;
+ }
+ }
+ }
+ }
+ return;
+ }
+ const float rdepth = (outputDepth > 1) ? (float)(inputDepth - 1)/(outputDepth - 1) : 0.f;
+ const float rheight = (outputHeight > 1) ? (float)(inputHeight - 1)/(outputHeight - 1) : 0.f;
+ const float rwidth = (outputWidth > 1) ? (float)(inputWidth - 1)/(outputWidth - 1) : 0.f;
+ for (int t2 = 0; t2 < outputDepth; ++t2) {
+ const float t1r = rdepth * t2;
+ const int t1 = t1r;
+ const int t1p = (t1 < inputDepth - 1) ? 1 : 0;
+ const real t1lambda = t1r - t1;
+ const real t0lambda = (real)1. - t1lambda;
+ for (int h2 = 0; h2 < outputHeight; ++h2) {
+ const float h1r = rheight * h2;
+ const int h1 = h1r;
+ const int h1p = (h1 < inputHeight - 1) ? 1 : 0;
+ const real h1lambda = h1r - h1;
+ const real h0lambda = (real)1. - h1lambda;
+ for (int w2 = 0; w2 < outputWidth; ++w2) {
+ const float w1r = rwidth * w2;
+ const int w1 = w1r;
+ const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
+ const real w1lambda = w1r - w1;
+ const real w0lambda = (real)1. - w1lambda;
+ real* pos1 = &data1[t1 * inputHeight * inputWidth + h1 * inputWidth + w1];
+ const real* pos2 = &data2[t2 * outputHeight * outputWidth + h2 * outputWidth + w2];
+ for (int c = 0; c < channels; ++c) {
+ pos1[0] += t0lambda * h0lambda * w0lambda * pos2[0];
+ pos1[w1p] += t0lambda * h0lambda * w1lambda * pos2[0];
+ pos1[h1p * inputWidth] += t0lambda * h1lambda * w0lambda * pos2[0];
+ pos1[h1p * inputWidth + w1p] += t0lambda * h1lambda * w1lambda * pos2[0];
+ pos1[t1p * inputHeight * inputWidth] += t1lambda * h0lambda * w0lambda * pos2[0];
+ pos1[t1p * inputHeight * inputWidth + w1p] += t1lambda * h0lambda * w1lambda * pos2[0];
+ pos1[t1p * inputHeight * inputWidth + h1p * inputWidth] += t1lambda * h1lambda * w0lambda * pos2[0];
+ pos1[t1p * inputHeight * inputWidth + h1p * inputWidth + w1p] += t1lambda * h1lambda * w1lambda * pos2[0];
+ pos1 += inputWidth * inputHeight * inputDepth;
+ pos2 += outputWidth * outputHeight * outputDepth;
+ }
+ }
+ }
+ }
+ THTensor_(free)(gradOutput);
+}
+
+#endif
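
The lambda terms above are the fractional parts of the real-valued source
coordinate; trilinear interpolation is the tensor product of three 1-D linear
interpolations, which is where the eight weighted terms in both kernels come
from. A 1-D sketch of the coefficient computation, using the same align-corners
ratio as the r{depth,height,width} values above (hypothetical helper name):

#include <stdio.h>

/* For output index o along one axis, derive the lower source index i0,
 * the offset to its upper neighbour (0 at the right border), and the
 * blend weight lambda1 of that upper neighbour. */
static void linear_coeffs(int o, int isize, int osize,
                          int *i0, int *i0p, float *lambda1)
{
  const float r = (osize > 1) ? (float)(isize - 1) / (osize - 1) : 0.f;
  const float src = r * o;          /* real-valued source coordinate */
  *i0 = (int)src;                   /* lower neighbour */
  *i0p = (*i0 < isize - 1) ? 1 : 0; /* clamped upper-neighbour offset */
  *lambda1 = src - (float)*i0;      /* weight of the upper neighbour */
}

int main(void)
{
  /* upsample a 3-sample axis to 5 samples:
   * out[o] = (1 - lambda1)*in[i0] + lambda1*in[i0 + i0p],
   * with one such weight pair per axis in the 3-D kernels above */
  for (int o = 0; o < 5; o++) {
    int i0, i0p; float l;
    linear_coeffs(o, 3, 5, &i0, &i0p, &l);
    printf("o=%d i0=%d i0p=%d lambda1=%.2f\n", o, i0, i0p, l);
  }
  return 0;
}
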
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/unfold.c b/contrib/lua-torch/nn/lib/THNN/generic/unfold.c
new file mode 100644
index 000000000..14a73b567
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/unfold.c
@@ -0,0 +1,166 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/unfold.c"
+#else
+
+/* note: unlike unfolded_copy, this accumulation cannot be parallelized as
+ finely, because overlapping kernel taps write to the same input locations */
+void THNN_(unfolded_acc)(
+ THTensor *finput,
+ THTensor *input,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ int nInputPlane,
+ int inputWidth,
+ int inputHeight,
+ int outputWidth,
+ int outputHeight)
+{
+ // This function assumes that
+ // outputHeight*dH does not overflow a long
+ // outputWidth*dW does not overflow a long
+
+ int nip;
+
+ real *input_data = THTensor_(data)(input);
+ real *finput_data = THTensor_(data)(finput);
+
+#pragma omp parallel for private(nip)
+ for(nip = 0; nip < nInputPlane; nip++)
+ {
+ int kw, kh, y, x;
+ long ix, iy;
+ for(kh = 0; kh < kH; kh++)
+ {
+ for(kw = 0; kw < kW; kw++)
+ {
+ real *src = finput_data + nip*((size_t)kH*kW*outputHeight*outputWidth) + kh*((size_t)kW*outputHeight*outputWidth) + kw*((size_t)outputHeight*outputWidth);
+ real *dst = input_data + nip*((size_t)inputHeight*inputWidth);
+ if (padW > 0 || padH > 0) {
+ int lpad,rpad;
+ for(y = 0; y < outputHeight; y++) {
+ iy = (long)y*dH - padH + kh;
+ if (iy >= 0 && iy < inputHeight) {
+ if (dW==1){
+ ix = 0 - padW + kw;
+ lpad = fmaxf(0,padW-kw);
+ rpad = fmaxf(0,padW-(kW-kw-1));
+ real *dst_slice = dst+(size_t)iy*inputWidth+ix+lpad;
+ THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+lpad, 1, outputWidth - lpad - rpad); /* note: THVector_add could handle 1 value better */
+ }
+ else{
+ for (x=0; x<outputWidth; x++){
+ ix = (long)x*dW - padW + kw;
+ if (ix >= 0 && ix < inputWidth) {
+ real *dst_slice = dst+(size_t)iy*inputWidth+ix;
+ THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+x, 1, 1);
+ }
+ }
+ }
+ }
+ }
+ } else {
+ for(y = 0; y < outputHeight; y++) {
+ iy = (long)y*dH + kh;
+ ix = 0 + kw;
+ if (dW == 1 ) {
+ real *dst_slice = dst+(size_t)iy*inputWidth+ix;
+ THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth, 1, outputWidth); /* note: THVector_add could handle 1 value better */
+ }else{
+ for(x = 0; x < outputWidth; x++) {
+ real *dst_slice = dst+(size_t)iy*inputWidth+ix+x*dW;
+ THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+x, 1, 1);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+void THNN_(unfolded_copy)(
+ THTensor *finput,
+ THTensor *input,
+ int kW,
+ int kH,
+ int dW,
+ int dH,
+ int padW,
+ int padH,
+ int nInputPlane,
+ int inputWidth,
+ int inputHeight,
+ int outputWidth,
+ int outputHeight)
+{
+ // This function assumes that
+ // kH*kW does not overflow an int
+ // nInputPlane*kH*kW does not overflow a long
+ // outputHeight*dH does not overflow a long
+ // outputWidth*dW does not overflow a long
+
+ long k;
+ real *input_data = THTensor_(data)(input);
+ real *finput_data = THTensor_(data)(finput);
+
+#pragma omp parallel for private(k)
+ for(k = 0; k < (long)nInputPlane*kH*kW; k++) {
+ long nip = k / (kH*kW);
+ long rest = k % (kH*kW);
+ long kh = rest / kW;
+ long kw = rest % kW;
+ int x, y;
+ long ix, iy;
+ real *dst = finput_data + nip*((size_t)kH*kW*outputHeight*outputWidth) + kh*((size_t)kW*outputHeight*outputWidth) + kw*((size_t)outputHeight*outputWidth);
+ real *src = input_data + nip*((size_t)inputHeight*inputWidth);
+ if (padW > 0 || padH > 0) {
+ long lpad,rpad;
+ for(y = 0; y < outputHeight; y++) {
+ iy = (long)y*dH - padH + kh;
+ if (iy < 0 || iy >= inputHeight) {
+ memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*outputWidth);
+ } else {
+ if (dW==1){
+ ix = 0 - padW + kw;
+ lpad = fmaxf(0,padW-kw);
+ rpad = fmaxf(0,padW-(kW-kw-1));
+ if (outputWidth-rpad-lpad <= 0) {
+ memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*outputWidth);
+ } else {
+ if (lpad > 0) memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*lpad);
+ memcpy(dst+(size_t)y*outputWidth+lpad, src+(size_t)iy*inputWidth+ix+lpad, sizeof(real)*(outputWidth-rpad-lpad));
+ if (rpad > 0) memset(dst+(size_t)y*outputWidth + outputWidth - rpad, 0, sizeof(real)*rpad);
+ }
+ }
+ else{
+ for (x=0; x<outputWidth; x++){
+ ix = (long)x*dW - padW + kw;
+ if (ix < 0 || ix >= inputWidth)
+ memset(dst+(size_t)y*outputWidth+x, 0, sizeof(real)*1);
+ else
+ memcpy(dst+(size_t)y*outputWidth+x, src+(size_t)iy*inputWidth+ix, sizeof(real)*(1));
+ }
+ }
+ }
+ }
+ } else {
+ for(y = 0; y < outputHeight; y++) {
+ iy = (long)y*dH + kh;
+ ix = 0 + kw;
+ if (dW == 1)
+ memcpy(dst+(size_t)y*outputWidth, src+(size_t)iy*inputWidth+ix, sizeof(real)*outputWidth);
+ else{
+ for (x=0; x<outputWidth; x++)
+ memcpy(dst+(size_t)y*outputWidth+x, src+(size_t)iy*inputWidth+ix+(long)x*dW, sizeof(real)*(1));
+ }
+ }
+ }
+ }
+}
+
+#endif
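
For reference, unfolded_copy is the standard im2col transform: row (nip, kh, kw)
of finput holds, for every output location, the input pixel that kernel tap
sees, and unfolded_acc is its accumulating transpose (col2im). The unpadded
branch above is equivalent to this compact sketch (hypothetical name; no
padding, none of the memcpy/THVector fast paths):

#include <stddef.h>

/* im2col without padding: finput has nInputPlane*kH*kW rows of
 * outputHeight*outputWidth columns. */
static void im2col_nopad(const float *input, float *finput,
                         int nInputPlane, int inputHeight, int inputWidth,
                         int kH, int kW, int dH, int dW,
                         int outputHeight, int outputWidth)
{
  for (int nip = 0; nip < nInputPlane; nip++)
    for (int kh = 0; kh < kH; kh++)
      for (int kw = 0; kw < kW; kw++)
        for (int y = 0; y < outputHeight; y++)
          for (int x = 0; x < outputWidth; x++) {
            size_t row = ((size_t)nip * kH + kh) * kW + kw;
            size_t dst = row * outputHeight * outputWidth
                       + (size_t)y * outputWidth + x;
            size_t src = (size_t)nip * inputHeight * inputWidth
                       + (size_t)(y * dH + kh) * inputWidth
                       + (size_t)(x * dW + kw);
            finput[dst] = input[src];
          }
}

After this transform, a convolution becomes a single matrix multiply of the
weight matrix against finput, which is exactly how the SpatialConvolutionMM
family of kernels in this directory uses these helpers.
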