Diffstat (limited to 'contrib/lua-torch/nn/lib/THNN/generic')
73 files changed, 17098 insertions, 0 deletions
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Abs.c b/contrib/lua-torch/nn/lib/THNN/generic/Abs.c new file mode 100644 index 000000000..28721ec8e --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/Abs.c @@ -0,0 +1,28 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Abs.c" +#else + +void THNN_(Abs_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output) +{ + THTensor_(resizeAs)(output, input); + THTensor_(abs)(output, input); +} + +void THNN_(Abs_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput) +{ + THNN_CHECK_NELEMENT(input, gradOutput); + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + real z = *input_data; + *gradInput_data = *gradOutput_data * (z >= 0 ? 1 : -1); + ); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/AbsCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/AbsCriterion.c new file mode 100644 index 000000000..9bee5de9e --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/AbsCriterion.c @@ -0,0 +1,40 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/AbsCriterion.c" +#else + +void THNN_(AbsCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage) +{ + real sum = 0; + THNN_CHECK_NELEMENT(input, target); + TH_TENSOR_APPLY2(real, input, real, target, + sum += fabs(*input_data - *target_data); + ); + + if (sizeAverage) + sum /= THTensor_(nElement)(input); + + THTensor_(set1d)(output, 0, sum); +} + +void THNN_(AbsCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage) +{ + THNN_CHECK_NELEMENT(input, target); + real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.); + + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + *gradInput_data = (*input_data - *target_data) >= 0 ? norm : -norm; + ); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/BCECriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/BCECriterion.c new file mode 100644 index 000000000..637a4067e --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/BCECriterion.c @@ -0,0 +1,66 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/BCECriterion.c" +#else + +#define EPS 1e-12 + +void THNN_(BCECriterion_updateOutput)(THNNState *state, THTensor *input, + THTensor *target, THTensor *output, + bool sizeAverage, THTensor *weights) +{ + THNN_CHECK_NELEMENT(input, target); + THNN_CHECK_NELEMENT(input, weights); + THNN_CHECK_DIM_SIZE(output, 1, 0, 1); + real sum = 0; + + if(weights) + TH_TENSOR_APPLY3(real, input, real, target, real, weights, + real x = *input_data; + real y = *target_data; + real w = *weights_data; + THAssertMsg(x >= 0. && x <= 1., + "input value should be between 0~1, but got %f", + (double) x); + sum -= (log(x + EPS) * y + log(1. - x + EPS) * (1. - y)) * w; + ) + else + TH_TENSOR_APPLY2(real, input, real, target, + real x = *input_data; + real y = *target_data; + THAssertMsg(x >= 0. && x <= 1., + "input value should be between 0~1, but got %f", + (double) x); + sum -= log(x + EPS) * y + log(1. - x + EPS) * (1. 
- y); + ); + + + if (sizeAverage) + sum /= THTensor_(nElement)(input); + + THTensor_(set1d)(output, 0, sum); +} + +void THNN_(BCECriterion_updateGradInput)(THNNState *state, THTensor *input, + THTensor *target, THTensor *gradInput, + bool sizeAverage, THTensor *weights) +{ + THNN_CHECK_NELEMENT(input, target); + THNN_CHECK_NELEMENT(input, weights); + + real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.); + + THTensor_(resizeAs)(gradInput, input); + + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + real x = *input_data; + real y = *target_data; + *gradInput_data = - norm * (y - x) / ((1. - x + EPS) * (x + EPS)); + ); + + if(weights) + THTensor_(cmul)(gradInput, gradInput, weights); +} + +#undef EPS + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/BatchNormalization.c b/contrib/lua-torch/nn/lib/THNN/generic/BatchNormalization.c new file mode 100644 index 000000000..b8f462790 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/BatchNormalization.c @@ -0,0 +1,149 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/BatchNormalization.c" +#else + +void THNN_(BatchNormalization_updateOutput)( + THNNState *state, THTensor *input, THTensor *output, + THTensor *weight, THTensor *bias, + THTensor *running_mean, THTensor *running_var, + THTensor *save_mean, THTensor *save_std, + bool train, double momentum, double eps) +{ + THTensor_(resizeAs)(output, input); + long nInput = THTensor_(size)(input, 1); + long f; + ptrdiff_t n = THTensor_(nElement)(input) / nInput; + + #pragma omp parallel for + for (f = 0; f < nInput; ++f) { + THTensor *in = THTensor_(newSelect)(input, 1, f); + THTensor *out = THTensor_(newSelect)(output, 1, f); + + real mean, invstd; + + if (train) { + // compute mean per input + accreal sum = 0; + TH_TENSOR_APPLY(real, in, sum += *in_data;); + + mean = (real) sum / n; + THTensor_(set1d)(save_mean, f, (real) mean); + + // compute variance per input + sum = 0; + TH_TENSOR_APPLY(real, in, + sum += (*in_data - mean) * (*in_data - mean);); + + if (sum == 0 && eps == 0.0) { + invstd = 0; + } else { + invstd = (real) (1 / sqrt(sum/n + eps)); + } + THTensor_(set1d)(save_std, f, (real) invstd); + + // update running averages + THTensor_(set1d)(running_mean, f, + (real) (momentum * mean + (1 - momentum) * THTensor_(get1d)(running_mean, f))); + + accreal unbiased_var = sum / (n - 1); + THTensor_(set1d)(running_var, f, + (real) (momentum * unbiased_var + (1 - momentum) * THTensor_(get1d)(running_var, f))); + } else { + mean = THTensor_(get1d)(running_mean, f); + invstd = 1 / sqrt(THTensor_(get1d)(running_var, f) + eps); + } + + // compute output + real w = weight ? THTensor_(get1d)(weight, f) : 1; + real b = bias ? 
THTensor_(get1d)(bias, f) : 0;
+
+    TH_TENSOR_APPLY2(real, in, real, out,
+      *out_data = (real) (((*in_data - mean) * invstd) * w + b););
+
+    THTensor_(free)(out);
+    THTensor_(free)(in);
+  }
+}
+
+void THNN_(BatchNormalization_backward)(
+  THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput,
+  THTensor *gradWeight, THTensor *gradBias, THTensor *weight,
+  THTensor *running_mean, THTensor *running_var,
+  THTensor *save_mean, THTensor *save_std,
+  bool train, double scale, double eps)
+{
+  THNN_CHECK_SHAPE(input, gradOutput);
+  long nInput = THTensor_(size)(input, 1);
+  long f;
+  ptrdiff_t n = THTensor_(nElement)(input) / nInput;
+
+  #pragma omp parallel for
+  for (f = 0; f < nInput; ++f) {
+    THTensor *in = THTensor_(newSelect)(input, 1, f);
+    THTensor *gradOut = THTensor_(newSelect)(gradOutput, 1, f);
+    real w = weight ? THTensor_(get1d)(weight, f) : 1;
+    real mean, invstd;
+    if (train) {
+      mean = THTensor_(get1d)(save_mean, f);
+      invstd = THTensor_(get1d)(save_std, f);
+    } else {
+      mean = THTensor_(get1d)(running_mean, f);
+      invstd = 1 / sqrt(THTensor_(get1d)(running_var, f) + eps);
+    }
+
+    // sum over all gradOutput in feature plane
+    accreal sum = 0;
+    TH_TENSOR_APPLY(real, gradOut, sum += *gradOut_data;);
+
+    // dot product of Q(X) and gradOutput
+    accreal dotp = 0;
+    TH_TENSOR_APPLY2(real, in, real, gradOut,
+      dotp += (*in_data - mean) * (*gradOut_data););
+
+    if (gradInput) {
+      THTensor_(resizeAs)(gradInput, input);
+      THTensor *gradIn = THTensor_(newSelect)(gradInput, 1, f);
+
+      if (train) {
+        // when in training mode
+        // Q(X) = X - E[x] ; i.e. input centered to zero mean
+        // Y = Q(X) / σ ; i.e. BN output before weight and bias
+        // dL/dX = (Q(dL/dY) - dot(Y, dL/dY) * Y) / σ * w
+
+        // projection of gradOutput onto output scaled by std
+        real k = (real) dotp * invstd * invstd / n;
+        TH_TENSOR_APPLY2(real, gradIn, real, in,
+          *gradIn_data = (*in_data - mean) * k;);
+
+        accreal gradMean = sum / n;
+        TH_TENSOR_APPLY2(real, gradIn, real, gradOut,
+          *gradIn_data = (*gradOut_data - gradMean - *gradIn_data) * invstd * w;);
+
+      } else {
+        // when in evaluation mode
+        // Q(X) = X - running_mean ; i.e. input centered to zero mean
+        // Y = Q(X) / running_std ; i.e.
BN output before weight and bias + // dL/dX = w / running_std + TH_TENSOR_APPLY2(real, gradIn, real, gradOut, + *gradIn_data = *gradOut_data * invstd * w;); + } + + THTensor_(free)(gradIn); + } + + if (gradWeight) { + real val = THTensor_(get1d)(gradWeight, f); + THTensor_(set1d)(gradWeight, f, val + scale * dotp * invstd); + } + + if (gradBias) { + real val = THTensor_(get1d)(gradBias, f); + THTensor_(set1d)(gradBias, f, val + scale * sum); + } + + THTensor_(free)(gradOut); + THTensor_(free)(in); + } +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/ClassNLLCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/ClassNLLCriterion.c new file mode 100644 index 000000000..4cf37aeaf --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/ClassNLLCriterion.c @@ -0,0 +1,163 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/ClassNLLCriterion.c" +#else + +void THNN_(ClassNLLCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *output, + bool sizeAverage, + THTensor *weights, + THTensor *total_weight, + long ignore_index) +{ + THNN_CHECK_DIM_SIZE(output, 1, 0, 1); + THNN_CHECK_DIM_SIZE(total_weight, 1, 0, 1); + int n_dims = THTensor_(nDimension)(input); + int n_classes = THTensor_(size)(input, n_dims - 1); + ignore_index -= TH_INDEX_BASE; + + if (THIndexTensor_(nDimension)(target) > 1) { + THError("multi-target not supported"); + } + if (THTensor_(nDimension)(input) > 2) { + THError("input tensor should be 1D or 2D"); + } + if (weights && THTensor_(nElement)(weights) != n_classes) { + THDescBuff s1 = THTensor_(sizeDesc)(weights); + THError("weight tensor should be defined either for all %d classes or no classes" + " but got weight tensor of shape: %s", n_classes, s1.str); + } + + input = THTensor_(newContiguous)(input); + target = THIndexTensor_(newContiguous)(target); + weights = weights ? THTensor_(newContiguous)(weights) : NULL; + + real *input_data = THTensor_(data)(input); + THIndex_t *target_data = THIndexTensor_(data)(target); + real *weights_data = weights ? THTensor_(data)(weights) : NULL; + real *output_data = THTensor_(data)(output); + real *total_weight_data = THTensor_(data)(total_weight); + + output_data[0] = total_weight_data[0] = 0.0; + + if (THTensor_(nDimension)(input) == 1) { + int cur_target = target_data[0] - TH_INDEX_BASE; + if (cur_target != ignore_index) { + THAssert(cur_target >= 0 && cur_target < n_classes); + total_weight_data[0] = weights ? weights_data[cur_target] : 1.0f; + output_data[0] = -input_data[cur_target] * total_weight_data[0]; + } + } else if (THTensor_(nDimension)(input) == 2) { + int batch_size = THTensor_(size)(input, 0); + THAssert(THIndexTensor_(size)(target, 0) == batch_size); + + int n_target = THTensor_(size)(input, 1); + + int i; + for (i = 0; i < batch_size; i++) { + int cur_target = target_data[i] - TH_INDEX_BASE; + if (cur_target != ignore_index) { + THAssert(cur_target >= 0 && cur_target < n_classes); + + real cur_weight = weights ? 
weights_data[cur_target] : 1.0f; + total_weight_data[0] += cur_weight; + output_data[0] -= input_data[i * n_target + cur_target] * cur_weight; + } + } + } + + if (sizeAverage && total_weight_data[0]) { + output_data[0] /= total_weight_data[0]; + } + + if (weights) { + THTensor_(free)(weights); + } + THTensor_(free)(input); + THIndexTensor_(free)(target); +} + +void THNN_(ClassNLLCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *gradInput, + bool sizeAverage, + THTensor *weights, + THTensor *total_weight, + long ignore_index) +{ + int n_dims = THTensor_(nDimension)(input); + int n_classes = THTensor_(size)(input, n_dims - 1); + ignore_index -= TH_INDEX_BASE; + + if (!THTensor_(isContiguous)(gradInput)) { + THError("gradInput must be contiguous"); + } + + real *total_weight_data = THTensor_(data)(total_weight); + + if (!(*total_weight_data > 0)) { + return; + } + + if (THIndexTensor_(nDimension)(target) > 1) { + THError("multi-target not supported"); + } + + if (THTensor_(nDimension)(input) > 2) { + THError("input tensor should be 1D or 2D"); + } + + if (weights && THTensor_(nElement)(weights) != n_classes) { + THError("weight tensor should be defined either for all or no classes"); + } + + target = THIndexTensor_(newContiguous)(target); + weights = weights ? THTensor_(newContiguous)(weights) : NULL; + + THIndex_t *target_data = THIndexTensor_(data)(target); + real *weights_data = weights ? THTensor_(data)(weights) : NULL; + real *gradInput_data = THTensor_(data)(gradInput); + + if (THTensor_(nDimension)(input) == 1) { + int cur_target = target_data[0] - TH_INDEX_BASE; + if (cur_target != ignore_index) { + THAssert(cur_target >= 0 && cur_target < n_classes); + + gradInput_data[cur_target] = + (!sizeAverage && weights) ? -weights_data[cur_target] : -1; + } + + } else if (THTensor_(nDimension)(input) == 2) { + int batch_size = THTensor_(size)(input, 0); + THAssert(THIndexTensor_(size)(target, 0) == batch_size); + + int n_target = THTensor_(size)(input, 1); + + int i; + for (i = 0; i < batch_size; i++){ + int cur_target = target_data[i] - TH_INDEX_BASE; + + if (cur_target != ignore_index) { + THAssert(cur_target >= 0 && cur_target < n_classes); + + gradInput_data[i * n_target + cur_target] = + -(weights ? weights_data[cur_target] : 1.0f); + + if (sizeAverage && *total_weight_data) { + gradInput_data[i * n_target + cur_target] /= *total_weight_data; + } + } + } + } + + THIndexTensor_(free)(target); + if (weights) { + THTensor_(free)(weights); + } +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/DistKLDivCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/DistKLDivCriterion.c new file mode 100644 index 000000000..6bd6aa067 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/DistKLDivCriterion.c @@ -0,0 +1,44 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/DistKLDivCriterion.c" +#else + +void THNN_(DistKLDivCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage) +{ + THNN_CHECK_NELEMENT(input, target); + THNN_CHECK_DIM_SIZE(output, 1, 0, 1); + + real sum = 0; + + TH_TENSOR_APPLY2(real, input, real, target, + sum += *target_data > 0 ? 
*target_data * (log(*target_data) - *input_data) : 0; + ); + + if (sizeAverage) + sum /= THTensor_(nElement)(input); + + THTensor_(set1d)(output, 0, sum); +} + +void THNN_(DistKLDivCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage) +{ + THNN_CHECK_NELEMENT(input, target); + + real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.); + + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + *gradInput_data = *target_data > 0 ? norm * (-*target_data) : 0; + ); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/ELU.c b/contrib/lua-torch/nn/lib/THNN/generic/ELU.c new file mode 100644 index 000000000..ddcfb9705 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/ELU.c @@ -0,0 +1,54 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/ELU.c" +#else + +void THNN_(ELU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal alpha_, + bool inplace) +{ + real alpha = TH_CONVERT_ACCREAL_TO_REAL(alpha_); + if(inplace) { + TH_TENSOR_APPLY(real, input, + if(*input_data <= 0) { + *input_data = (exp(*input_data) - 1) * alpha; + } + ); + THTensor_(set)(output, input); + } else { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY2(real, input, real, output, + *output_data = *input_data <= 0 ? (exp(*input_data)-1)*alpha : *input_data; + ); + } +} + +void THNN_(ELU_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output, + accreal alpha_, + bool inplace) +{ + real alpha = TH_CONVERT_ACCREAL_TO_REAL(alpha_); + THNN_CHECK_NELEMENT(input, gradOutput); + if(inplace) { + TH_TENSOR_APPLY2(real, gradOutput, real, output, + if(*output_data <= 0) { + *gradOutput_data *= *output_data + alpha; + } + ); + THTensor_(set)(gradInput, gradOutput); + } else { + THTensor_(resizeAs)(gradInput, output); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, + *gradInput_data = *output_data <= 0 ? 
*gradOutput_data * (*output_data + alpha) : *gradOutput_data; + ); + } +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/FusedRNNKernel.c b/contrib/lua-torch/nn/lib/THNN/generic/FusedRNNKernel.c new file mode 100644 index 000000000..30788b0a2 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/FusedRNNKernel.c @@ -0,0 +1,55 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/FusedRNNKernel.c" +#else + +void THNN_(GRUFused_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *hidden, + THTensor *bias1, + THTensor *bias2, + THTensor *hx, + THTensor *hy, + THTensor *storage) +{ + THAssertMsg(false, "Not implemented for CPU"); +} + +void THNN_(GRUFused_updateGradInput)( + THNNState *state, + THTensor *gradInInput, + THTensor *gradInHidden, + THTensor *gradOutput, + THTensor *gradInputHx, + THTensor *storage) +{ + THAssertMsg(false, "Not implemented for CPU"); +} + +void THNN_(LSTMFused_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *hidden, + THTensor *bias1, + THTensor *bias2, + THTensor *cx, + THTensor *hy, + THTensor *cy) +{ + THAssertMsg(false, "Not implemented for CPU"); +} + +void THNN_(LSTMFused_updateGradInput)( + THNNState *state, + THTensor *storage, + THTensor *gradInGates, + THTensor *prevC, + THTensor *cy, + THTensor *gradOutput, + THTensor *gradOutputCell, + THTensor *gradInputCx) +{ + THAssertMsg(false, "Not implemented for CPU"); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/GatedLinearUnit.c b/contrib/lua-torch/nn/lib/THNN/generic/GatedLinearUnit.c new file mode 100644 index 000000000..274a27e3b --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/GatedLinearUnit.c @@ -0,0 +1,73 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/GatedLinearUnit.c" +#else + +void THNN_(GatedLinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int dim) +{ + // size output to half of input + dim = dim - TH_INDEX_BASE; + const long nIn = THTensor_(size)(input, dim); + THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. Dim %d is size %ld", + dim + TH_INDEX_BASE, nIn); + + const long inputSize = THTensor_(size)(input, dim) / 2; + THLongStorage *newSizes = THTensor_(newSizeOf)(input); + THLongStorage_set(newSizes, dim, inputSize); + THTensor_(resize)(output, newSizes, NULL); + + // halve tensor + THTensor *firstHalf = THTensor_(newNarrow)(input, dim, 0, inputSize); + THTensor *secondHalf = THTensor_(newNarrow)(input, dim, inputSize, inputSize); + + // x = x1:cmul( sigmoid(x2) ) + THTensor_(sigmoid)(output, secondHalf); + THTensor_(cmul)(output, output, firstHalf); + + THLongStorage_free(newSizes); + THTensor_(free)(firstHalf); + THTensor_(free)(secondHalf); +} + +void THNN_(GatedLinear_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int dim) +{ + // set up tensors + dim = dim - TH_INDEX_BASE; + const long nIn = THTensor_(size)(input, dim); + THArgCheck(nIn % 2 == 0, 2, "Halving dimension must be even. 
Dim %d is size %ld", + dim + TH_INDEX_BASE, nIn); + + THTensor_(resizeAs)(gradInput, input); + const long inputSize = THTensor_(size)(input, dim) / 2; + THTensor *firstHalf = THTensor_(newNarrow)(input, dim, 0, inputSize); + THTensor *secondHalf = THTensor_(newNarrow)(input, dim, inputSize, inputSize); + THTensor *gradInputfirstHalf = THTensor_(newNarrow)(gradInput, dim, 0, inputSize); + THTensor *gradInputsecondHalf = THTensor_(newNarrow)(gradInput, dim, inputSize, inputSize); + + THTensor_(sigmoid)(gradInputfirstHalf, secondHalf); + + TH_TENSOR_APPLY2(real, gradInputsecondHalf, real, gradInputfirstHalf, + real z = *gradInputfirstHalf_data; + *gradInputsecondHalf_data = (1. - z) * z; + ); + + THTensor_(cmul)(gradInputfirstHalf, gradInputfirstHalf, gradOutput); + + THTensor_(cmul)(gradInputsecondHalf, gradInputsecondHalf, gradOutput); + THTensor_(cmul)(gradInputsecondHalf, gradInputsecondHalf, firstHalf); + + THTensor_(free)(firstHalf); + THTensor_(free)(secondHalf); + THTensor_(free)(gradInputfirstHalf); + THTensor_(free)(gradInputsecondHalf); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/HardShrink.c b/contrib/lua-torch/nn/lib/THNN/generic/HardShrink.c new file mode 100644 index 000000000..aaae85bac --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/HardShrink.c @@ -0,0 +1,42 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/HardShrink.c" +#else + +void THNN_(HardShrink_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal lambda_) +{ + real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_); + THTensor_(resizeAs)(output, input); + + TH_TENSOR_APPLY2(real, output, real, input, + if (*input_data > lambda) + *output_data = *input_data; + else if (*input_data < -lambda) + *output_data = *input_data; + else + *output_data = 0; + ); +} + +void THNN_(HardShrink_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + accreal lambda_) +{ + real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_); + THNN_CHECK_NELEMENT(input, gradOutput); + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + if (*input_data > lambda || *input_data < -lambda) + *gradInput_data = *gradOutput_data; + else + *gradInput_data = 0; + ); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/HardTanh.c b/contrib/lua-torch/nn/lib/THNN/generic/HardTanh.c new file mode 100644 index 000000000..589a66e15 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/HardTanh.c @@ -0,0 +1,133 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/HardTanh.c" +#else + +void THNN_(HardTanh_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal min_val_, + accreal max_val_, + bool inplace) +{ + real min_val = TH_CONVERT_ACCREAL_TO_REAL(min_val_); + real max_val = TH_CONVERT_ACCREAL_TO_REAL(max_val_); + if (inplace) + THTensor_(set)(output, input); + else + THTensor_(resizeAs)(output, input); + + if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output)) + { + if (inplace) + TH_TENSOR_APPLY(real, input, + if (*input_data < min_val) + *input_data = min_val; + else if (*input_data > max_val) + *input_data = max_val; + ); + TH_TENSOR_APPLY2(real, output, real, input, + if (*input_data < min_val) + *output_data = min_val; + else if (*input_data <= max_val) + *output_data = *input_data; + else + *output_data = max_val; + ); + } + else + { + real* ptr_input = THTensor_(data)(input); + real* ptr_output 
= THTensor_(data)(output); + ptrdiff_t i; + ptrdiff_t n = THTensor_(nElement)(input); + + if (inplace) +#pragma omp parallel for private(i) + for (i = 0; i < n; i++) + { + if (ptr_input[i] < min_val) + ptr_input[i] = min_val; + else if (ptr_input[i] > max_val) + ptr_input[i] = max_val; + } + else +#pragma omp parallel for private(i) + for (i = 0; i < n; i++) + { + if (ptr_input[i] < min_val) + ptr_output[i] = min_val; + else if (ptr_input[i] <= max_val) + ptr_output[i] = ptr_input[i]; + else + ptr_output[i] = max_val; + } + } +} + +void THNN_(HardTanh_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + accreal min_val_, + accreal max_val_, + bool inplace) +{ + real min_val = TH_CONVERT_ACCREAL_TO_REAL(min_val_); + real max_val = TH_CONVERT_ACCREAL_TO_REAL(max_val_); + + THNN_CHECK_NELEMENT(input, gradOutput); + if (inplace) + THTensor_(set)(gradInput, gradOutput); + else + THTensor_(resizeAs)(gradInput, input); + + if (input->nDimension == 1 || + !THTensor_(isContiguous)(input) || + !THTensor_(isContiguous)(gradOutput) || + !THTensor_(isContiguous)(gradInput)) + { + if (inplace) + { + TH_TENSOR_APPLY2(real, gradOutput, real, input, + if (*input_data <= min_val || *input_data >= max_val) + *gradOutput_data = 0; + ); + } + else + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + if (*input_data <= min_val || *input_data >= max_val) + *gradInput_data = 0; + else + *gradInput_data = *gradOutput_data; + ); + } + else + { + real* ptr_gradOutput = THTensor_(data)(gradOutput); + real* ptr_gradInput = THTensor_(data)(gradInput); + real* ptr_input = THTensor_(data)(input); + ptrdiff_t i; + ptrdiff_t n = THTensor_(nElement)(input); + + if (inplace) +#pragma omp parallel for private(i) + for (i = 0; i < n; i++) + { + if (ptr_input[i] <= min_val || ptr_input[i] >= max_val) + ptr_gradInput[i] = 0; + } + else +#pragma omp parallel for private(i) + for (i = 0; i < n; i++) + { + if (ptr_input[i] <= min_val || ptr_input[i] >= max_val) + ptr_gradInput[i] = 0; + else + ptr_gradInput[i] = ptr_gradOutput[i]; + } + } +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/IndexLinear.c b/contrib/lua-torch/nn/lib/THNN/generic/IndexLinear.c new file mode 100644 index 000000000..42d8368ba --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/IndexLinear.c @@ -0,0 +1,742 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/IndexLinear.c" +#else + +#ifdef _OPENMP +#include <omp.h> +#endif + +/* Threshold used to trigger multithreading */ +#ifndef THNN_SPARSE_OMP_THRESHOLD +#define THNN_SPARSE_OMP_THRESHOLD 100000 +#endif + +/* Threshold used to trigger BLAS axpy call */ +#ifndef THNN_SPARSE_OUTDIM_THRESHOLD +#define THNN_SPARSE_OUTDIM_THRESHOLD 49 +#endif + +/* sign MACRO */ +#ifndef THNN_INDEXLINEAR_SIGN +#define THNN_INDEXLINEAR_SIGN(a) ( ( (a) < 0 ) ? 
-1 : ( (a) > 0 ) ) +#endif + +static bool THNN_(checkKeysValues)(THLongTensor* keys, THTensor* values) +{ + return THLongTensor_size(keys, 0) == THTensor_(nElement)(values) + && THTensor_(nDimension)(values) == 1 + && THLongTensor_nDimension(keys) == 1; +} + +void THNN_(IndexLinear_updateOutput)( + THNNState *state, + THLongTensor *keys, + long keysOffset, + THTensor *values, + THLongTensor *sizes, + THLongTensor *cumSumSizes, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *normalizedValues, + int train) +{ + /* Retrieve all the dimensions of the problem */ + long batchSize = THLongTensor_size(sizes, 0); + long keysSize = THLongTensor_size(keys, 0); + long outDim = THTensor_(size)(bias, 0); + long woutDim = THTensor_(size)(weight, 1); + int maxNormalize = woutDim - outDim; + long* sizesData = THLongTensor_data(sizes); + long* cumSumSizesData = THLongTensor_data(cumSumSizes); + + /* Define/resize the normalized values tensor if maxNormalize is > 0 */ + real* normalizedValuesData = NULL; + if (maxNormalize) + { + THTensor_(resize1d)(normalizedValues, keysSize); + normalizedValuesData = THTensor_(data)(normalizedValues); + } + + /* Resize the output */ + THTensor_(resize2d)(output, batchSize, outDim); + + /* Access the storage data/strides */ + real* outputData = THTensor_(data)(output); + real* valuesData = THTensor_(data)(values); + real* weightData = THTensor_(data)(weight); + long weightStride0 = weight->stride[0]; + real* biasData = THTensor_(data)(bias); + long* keysData = THLongTensor_data(keys); + + /* Make sure these inputs are contiguous to accelerate computations */ + THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(output), 6, "output vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous"); + THArgCheck(THTensor_(isContiguous)(bias), 8, "bias vector must be contiguous"); + THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements"); + THArgCheck(THTensor_(isContiguous)(normalizedValues), 9, "normalizedValues vector must be contiguous"); + long i,j,k; + + /* Separate cases: output dimension is == 1, or > 1 + * This allows for some optimizations. */ + if (outDim == 1) + { + THVector_(fill)(outputData, *biasData, batchSize); + if (maxNormalize) + { + /* Parallelize on the batch itself */ +#pragma omp parallel \ + for private(i,j) \ + firstprivate(outDim, keysOffset, \ + weightData, keysData, \ + valuesData, outputData, \ + cumSumSizesData, sizesData) \ + schedule(static) \ + if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1) + for (j = 0; j < batchSize; j++) + { + real* loutputData = outputData + j; + real val = 0; + real absVal = 0; + long offset = j == 0 ? 0 : cumSumSizesData[j - 1]; + + for (i = 0; i < sizesData[j]; i++) + { + long woffset = weightStride0*(keysData[offset] + keysOffset); + absVal = fabs(valuesData[offset]); + if (train) + { + if (absVal > weightData[woffset]) + { + weightData[woffset] = absVal; + weightData[woffset+1] = 1/absVal; + } + + /* + * The following can be used to scale the size of the updates + * depending on some rule, e.g. the frequency of a feature, ... + * This is used at update time. + * TODO: implement a smarter update scale. + */ + weightData[woffset+2] = 1; + } + normalizedValuesData[offset] = (absVal > weightData[woffset] ? 
THNN_INDEXLINEAR_SIGN(valuesData[offset]):valuesData[offset]*weightData[woffset+1]) + weightData[woffset+3]; + val += normalizedValuesData[offset] * weightData[woffset+maxNormalize]; + offset++; + } + *loutputData += val; + } + } + else + { + /* Parallelize on the batch itself */ +#pragma omp parallel \ + for private(i,j) \ + firstprivate(outDim, weightData, \ + keysData, valuesData, \ + outputData, cumSumSizesData, \ + sizesData) \ + schedule(static) \ + if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1) + for (j = 0; j < batchSize; j++) + { + long offset = j == 0 ? 0 : cumSumSizesData[j - 1]; + real* loutputData = outputData + j; + real val = 0; + + for (i = 0; i < sizesData[j]; i++) + { + val += weightData[weightStride0*(keysData[offset] + keysOffset)] * valuesData[offset]; + offset++; + } + *loutputData += val; + } + } + } + else { +#pragma omp parallel \ + for private(i,j,k) \ + firstprivate(outDim, weightData, \ + keysData, valuesData, \ + biasData, outputData, \ + cumSumSizesData, sizesData) \ + schedule(static) \ + if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1) + for (j = 0; j < batchSize; j++) + { + long offset = j == 0 ? 0 : cumSumSizesData[j - 1]; + real val = 0; + real* loutputData = outputData + j*outDim; + real* lweightData = weightData; + memcpy(loutputData, biasData, outDim*sizeof(real)); + for (i = 0; i < sizesData[j]; i++) + { + real val; + long woffset = weightStride0*(keysData[offset] + keysOffset); + if (maxNormalize) + { + val = valuesData[offset]; + real absVal = fabs(val); + if (train) + { + if (absVal > weightData[woffset]) + { + weightData[woffset] = absVal; + weightData[woffset+1] = 1/absVal; + } + + /* + * The following can be used to scale the size of the updates + * depending on some rule, e.g. the frequency of a feature, ... + * The commented section thereafter is just an example of what can be done: + * + *``` + * weightData[woffset+2] = weightData[woffset+2]==0?1:(weightData[woffset+2] / (weightData[woffset+2] + 1)); + * real alpha = 1; + * real beta = 0.01; + * real gamma = 1 - 0.000001; + * real l = weightData[woffset+2]==0?1/gamma:(weightData[woffset+2] - beta) / (alpha - beta); + * l = gamma*l; + * weightData[woffset+2] = (alpha-beta)*l + beta; + * ``` + * + * TODO: implement a smarter update scale. + */ + weightData[woffset+2] = 1; + } + + /* Normalize + Clamp */ + val = (absVal > weightData[woffset] ? 
THNN_INDEXLINEAR_SIGN(val):val*weightData[woffset+1]) + weightData[woffset+3];
+          normalizedValuesData[offset] = val;
+
+          lweightData = weightData + woffset + maxNormalize;
+        }
+        else
+        {
+          val = valuesData[offset];
+          lweightData = weightData + woffset;
+        }
+        if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD)
+        {
+          THBlas_(axpy)(outDim, val, lweightData, 1, loutputData, 1);
+        }
+        else
+        {
+          for (k=0; k < outDim; k++)
+          {
+            loutputData[k] += lweightData[k] * val;
+          }
+        }
+        offset++;
+      }
+    }
+  }
+  return;
+}
+
+void THNN_(IndexLinear_updateParameters)(
+          THNNState *state,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *weight,
+          THTensor *bias,
+          THLongTensor *runningKeys,
+          THLongTensor *cumSumSizes,
+          long keysOffset,
+          accreal weightDecay_,
+          accreal learningRate_)
+{
+  real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
+  real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
+  /* Retrieve all the dimensions of the problem */
+  long outDim = THTensor_(size)(bias, 0);
+  long woutDim = THTensor_(size)(weight, 1);
+  int maxNormalize = woutDim - outDim;
+  long keysSize = THLongTensor_size(runningKeys, 0);
+
+  /* Access the storage data/strides */
+  real* gradWeightData = THTensor_(data)(gradWeight);
+  real* weightData = THTensor_(data)(weight);
+  long weightStride0 = weight->stride[0];
+  real* gradBiasData = THTensor_(data)(gradBias);
+  real* biasData = THTensor_(data)(bias);
+  long* keysData = THLongTensor_data(runningKeys);
+
+  /* Make sure these inputs are contiguous to accelerate computations */
+  THArgCheck(THTensor_(isContiguous)(gradWeight), 1, "gradWeight must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(gradBias), 2, "gradBias vector must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(weight), 3, "weight matrix must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(bias), 4, "bias vector must be contiguous");
+  THArgCheck(THLongTensor_isContiguous(runningKeys), 5, "keys vector must be contiguous");
+
+  int j,k;
+  long offset = 0;
+
+  /* Update the bias first */
+  THVector_(cadd)(biasData, biasData, gradBiasData, -learningRate, outDim);
+
+  /* Separate cases: output dimension is == 1, or > 1
+   * This allows for some optimizations.
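+   * (For instance, when outDim == 1 the per-key update below is a single
+   * scalar multiply-accumulate on weightData, so both the inner loop over
+   * outDim and the THBlas_(axpy) call are skipped entirely.)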
+ * No multithreading here as this could + * corrupt the results (hogwild style) */ + if (outDim == 1) + { + if (maxNormalize) + { + if (weightDecay) + { + for (j = 0; j < keysSize; j++) + { + long woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize; + real lr = learningRate*weightData[woffset-2]; + weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr; + weightData[woffset] -= gradWeightData[2*j+1]*lr - weightDecay * weightData[woffset-2] * weightData[woffset]; + } + } + else + { + for (j = 0; j < keysSize; j++) + { + long woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize; + real lr = learningRate*weightData[woffset-2]; + weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr; + weightData[woffset] -= gradWeightData[2*j+1]*lr; + } + } + } + else + { + if (weightDecay) + { + for (j = 0; j < keysSize; j++) + { + long woffset = weightStride0*(keysData[j] + keysOffset); + weightData[woffset] -= gradWeightData[j]*learningRate + weightDecay * weightData[woffset]; + } + } + else + { + for (j = 0; j < keysSize; j++) + { + weightData[weightStride0*(keysData[j] + keysOffset)] -= gradWeightData[j]*learningRate; + } + } + } + } + else + { + for (j = 0; j < keysSize; j++) + { + real lr = learningRate; + real wd = weightDecay; + real* lweightData; + long woffset = weightStride0*(keysData[j] + keysOffset); + real* lgradWeightData = gradWeightData + j*outDim; + if (maxNormalize) + { + lgradWeightData += j*outDim; + /* weightData[woffset + 2] */ + lweightData = weightData + woffset + maxNormalize - 2; + lr = lr*lweightData[0]; + wd = weightDecay*lweightData[0]; + /* weightData[woffset + 3] */ + lweightData++; + for (k=0; k < outDim; k++) + { + lweightData[0] -= lgradWeightData[k]*lweightData[k+1]*lr; + } + lweightData++; + lgradWeightData += outDim; + } + else + { + lweightData = weightData + woffset; + } + + /* We do sparse weight decay. + * We think it makes more sense. 
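+   * In practice only the rows whose keys appear in runningKeys are
+   * decayed, i.e. w[k] -= wd * w[k] for each visited row k, rather than
+   * decaying the whole weight matrix on every step.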
*/ + if (weightDecay) + { + for (k=0; k < outDim; k++) + { + lweightData[k] -= lweightData[k]*wd; + } + } + + if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD) + { + THBlas_(axpy)(outDim, -lr, lgradWeightData, 1, lweightData, 1); + } + else + { + for (k=0; k < outDim; k++) + { + lweightData[k] -= lgradWeightData[k]*lr; + } + } + } + } +} + + +void THNN_(IndexLinear_accUpdateGradParameters)( + THNNState *state, + THLongTensor *keys, + long keysOffset, + THTensor *values, + THLongTensor *sizes, + THLongTensor *cumSumSizes, + THTensor *gradOutput, + THTensor *weight, + THTensor *bias, + accreal weightDecay_, + accreal scale_) +{ + real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_); + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + /* Retrieve all the dimensions of the problem */ + long batchSize = THLongTensor_size(sizes, 0); + long keysSize = THLongTensor_size(keys, 0); + long outDim = THTensor_(size)(bias, 0); + long woutDim = THTensor_(size)(weight, 1); + int maxNormalize = woutDim - outDim; + THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements"); + + /* Access the storage data/strides */ + real* gradOutputData = THTensor_(data)(gradOutput); + real* valuesData =THTensor_(data)(values); + real* weightData = THTensor_(data)(weight); + real* biasData = THTensor_(data)(bias); + long weightStride0 = weight->stride[0]; + long biasStride = bias->stride[0]; + long* keysData = THLongTensor_data(keys); + long* sizesData = THLongTensor_data(sizes); + + /* Make sure these inputs are contiguous to accelerate computations */ + THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous"); + THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous"); + THArgCheck(THTensor_(isContiguous)(bias), 8, "bias matrix must be contiguous"); + + int i,j,k; + + /* Separate cases: output dimension is == 1, or > 1 + * This allows for some optimizations. 
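+   * (Note the difference from accGradParameters below: this variant folds
+   * the scaled gradient straight into weightData and biasData, doing one
+   * scalar update per key when outDim == 1 and one axpy per key otherwise.)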
+ * No multithreading here as this could + * corrupt the results (hogwild style) */ + if (outDim == 1) + { + if (maxNormalize) + { + long offset = 0; + for (j = 0; j < batchSize; j++) + { + real* lgradOutputData = gradOutputData + j; + *biasData -= *lgradOutputData * scale; + real val = *lgradOutputData * scale; + real* lweightData = weightData; + for (i = 0; i < sizesData[j]; i++) + { + long idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize; + weightData[idx-1] -= weightData[idx]*val*weightData[idx-2]; + weightData[idx] -= (val*valuesData[offset] - weightDecay * weightData[idx])*weightData[idx-2]; + offset++; + } + } + + offset = 0; + for (j = 0; j < batchSize; j++) + { + real* lweightData = weightData; + for (i = 0; i < sizesData[j]; i++) + { + long idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize; + weightData[idx-2] = 0; + offset++; + } + } + } + else + { + if (weightDecay) + { + long offset = 0; + for (j = 0; j < batchSize; j++) + { + real* lgradOutputData = gradOutputData + j; + *biasData -= *lgradOutputData * scale; + real val = *lgradOutputData * scale; + real* lweightData = weightData; + for (i = 0; i < sizesData[j]; i++) + { + long idx = weightStride0*(keysData[offset] + keysOffset); + weightData[idx] -= val * valuesData[offset] + weightData[idx] * weightDecay; + offset++; + } + } + } + else + { + long offset = 0; + for (j = 0; j < batchSize; j++) + { + real val = gradOutputData[j] * scale; + for (i = 0; i < sizesData[j]; i++) + { + weightData[(keysData[offset] + keysOffset)*weightStride0] -= val * valuesData[offset]; + offset++; + } + *biasData -= val; + } + } + } + } + else { + long offset = 0; + for (j = 0; j < batchSize; j++) + { + real val = 0; + real* lgradOutputData = gradOutputData + j*outDim; + real* lweightData = weightData; + THVector_(cadd)(biasData, biasData, lgradOutputData, -scale, outDim); + for (i = 0; i < sizesData[j]; i++) + { + real val = valuesData[offset] * scale; + real wd = weightDecay; + + // Max normalize case + if (maxNormalize) + { + lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2); + val *= lweightData[0]; + wd *= lweightData[0]; + for (k=0; k < outDim; k++) + { + lweightData[1] -= lweightData[k+2]*scale*lgradOutputData[k]*lweightData[0]; + } + lweightData += 2; + } + else + { + lweightData = weightData + weightStride0*(keysData[offset] + keysOffset); + } + + /* We do sparse weight decay. + * We think it makes more sense. */ + if (weightDecay) + { + if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD) + { + THBlas_(axpy)(outDim, -wd, lweightData, 1, lweightData, 1); + } + else + { + for (k=0; k < outDim; k++) + { + lweightData[k] -= wd * lweightData[k]; + } + } + } + + if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD) + { + THBlas_(axpy)(outDim, -val, lgradOutputData, 1, lweightData, 1); + } + else + { + for (k=0; k < outDim; k++) + { + lweightData[k] -= val * lgradOutputData[k]; + } + } + offset++; + } + } + + /* Max Normalize case: + * Reset the smart update scaling if + * one does it batch-wise. + * TODO: Decide what to do with that piece of code. 
+   * NB: If the code below is uncommented, so should the commented
+   * code in IndexLinear:zeroGradParameters() */
+
+  /*
+  if (maxNormalize)
+  {
+    offset = 0;
+    for (j = 0; j < batchSize; j++)
+    {
+      real* lweightData = weightData;
+      for (i = 0; i < sizesData[j]; i++)
+      {
+        real val = valuesData[offset] * scale;
+        real wd = weightDecay;
+
+        lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2);
+        lweightData[0] = 0;
+        offset++;
+      }
+    }
+  }
+  */
+  }
+  return;
+}
+
+void THNN_(IndexLinear_accGradParameters)(
+          THNNState *state,
+          THLongTensor *keys,
+          long keysOffset,
+          THTensor *values,
+          THLongTensor *sizes,
+          THLongTensor *cumSumSizes,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,
+          THTensor *weight,
+          THTensor *bias,
+          THTensor *valuesBuffer,
+          accreal weightDecay_,
+          accreal scale_)
+{
+  real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
+  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+  /* Retrieve all the dimensions of the problem */
+  long batchSize = THLongTensor_size(sizes, 0);
+  long keysSize = THLongTensor_size(keys, 0);
+  long outDim = THTensor_(size)(bias, 0);
+  long woutDim = THTensor_(size)(weight, 1);
+  long maxNormalize = (woutDim - outDim) > 0 ?1:0;
+  THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");
+  long* sizesData = THLongTensor_data(sizes);
+
+  /* Compute the cumulative sizes */
+  THLongTensor* cumSizes = THLongTensor_new();
+  THLongTensor_cumsum(cumSizes, sizes, 0);
+  long* cumSizesData = THLongTensor_data(cumSizes);
+
+  /* Resize the gradWeight buffer to keep it dense.
+   * That speeds up updates A LOT assuming random mem access. */
+  THTensor_(resize2d)(gradWeight, keysSize, outDim * (maxNormalize>0?2:1));
+
+  /* Access the storage data/strides */
+  real* gradOutputData = THTensor_(data)(gradOutput);
+  real* valuesData = THTensor_(data)(values);
+  real* gradWeightData = THTensor_(data)(gradWeight);
+  real* weightData = THTensor_(data)(weight);
+  real* gradBiasData = THTensor_(data)(gradBias);
+  long gradWeightStride0 = gradWeight->stride[0];
+  long weightStride0 = weight->stride[0];
+  long* keysData = THLongTensor_data(keys);
+
+  /* Make sure these inputs are contiguous to accelerate computations */
+  THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(gradWeight), 7, "gradWeight must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(gradBias), 8, "gradBias vector must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(weight), 9, "weight must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(bias), 10, "bias vector must be contiguous");
+  THArgCheck(THTensor_(isContiguous)(valuesBuffer), 11, "valuesBuffer must be contiguous");
+
+  int i,j,k;
+
+  /* Separate cases: output dimension is == 1, or > 1
+   * This allows for some optimizations.
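+   * (Unlike accUpdateGradParameters above, this variant leaves the weights
+   * untouched and only fills the dense gradWeight buffer, one slice per
+   * visited key (two slices when maxNormalize is on); the update is then
+   * applied by IndexLinear_updateParameters.)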
+ * No multithreading here as this could + * corrupt the results (hogwild style) */ + if (outDim == 1) + { + for (j = 0; j < batchSize; j++) + { + long offset = j==0?0:cumSizesData[j-1]; + real val = gradOutputData[j] * scale; + real* lgradWeightData = gradWeightData + offset; + real* lvaluesData = valuesData + offset; + long end = sizesData[j]; + + if (maxNormalize) + { + lgradWeightData += offset; + i = 0; + for(;i < end; i++) + { + lgradWeightData[2*i] = val; + lgradWeightData[2*i+1] = val * lvaluesData[i]; + } + } + else + { + i = 0; + for(;i < end-4; i += 4) + { + lgradWeightData[i] = val * lvaluesData[i]; + lgradWeightData[i+1] = val * lvaluesData[i+1]; + lgradWeightData[i+2] = val * lvaluesData[i+2]; + lgradWeightData[i+3] = val * lvaluesData[i+3]; + } + + for(; i < end; i++) + { + lgradWeightData[i] = val * lvaluesData[i]; + } + } + *gradBiasData += val; + offset += end; + } + } + else { + for (j = 0; j < batchSize; j++) + { + long offset = j==0?0:cumSizesData[j-1]; + real val = 0; + real* lgradOutputData = gradOutputData + j*outDim; + real* lgradWeightData = gradWeightData; + real* lweightData = weightData; + THVector_(cadd)(gradBiasData, gradBiasData, lgradOutputData, scale, outDim); + for (i = 0; i < sizesData[j]; i++) + { + real val = valuesData[offset] * scale; + lgradWeightData = gradWeightData + offset*outDim; + if (maxNormalize) + { + lgradWeightData += offset*outDim; + k = 0; + for(;k < outDim-4; k += 4) + { + lgradWeightData[k] = lgradOutputData[k]*scale; + lgradWeightData[k+1] = lgradOutputData[k+1]*scale; + lgradWeightData[k+2] = lgradOutputData[k+2]*scale; + lgradWeightData[k+3] = lgradOutputData[k+3]*scale; + } + + for(; k < outDim; k++) + { + lgradWeightData[k] = lgradOutputData[k]*scale; + } + lgradWeightData += outDim; + } + k = 0; + for(;k < outDim-4; k += 4) + { + lgradWeightData[k] = val * lgradOutputData[k]; + lgradWeightData[k+1] = val * lgradOutputData[k+1]; + lgradWeightData[k+2] = val * lgradOutputData[k+2]; + lgradWeightData[k+3] = val * lgradOutputData[k+3]; + } + + for(; k < outDim; k++) + { + lgradWeightData[k] = val * lgradOutputData[k]; + } + offset++; + } + } + } + THLongTensor_free(cumSizes); + return; +} +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/L1Cost.c b/contrib/lua-torch/nn/lib/THNN/generic/L1Cost.c new file mode 100644 index 000000000..53940e894 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/L1Cost.c @@ -0,0 +1,38 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/L1Cost.c" +#else + +void THNN_(L1Cost_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output) +{ + THNN_CHECK_DIM_SIZE(output, 1, 0, 1); + accreal sum = 0; + + TH_TENSOR_APPLY(real, input, + sum += fabs(*input_data); + ); + + THTensor_(set1d)(output, 0, sum); +} + +void THNN_(L1Cost_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput) +{ + THNN_CHECK_NELEMENT(input, gradOutput); + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY2(real, gradInput, real, input, + if (*input_data > 0) + *gradInput_data = 1; + else if (*input_data < 0) + *gradInput_data = -1; + else + *gradInput_data = 0; + ); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/LeakyReLU.c b/contrib/lua-torch/nn/lib/THNN/generic/LeakyReLU.c new file mode 100644 index 000000000..074047d83 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/LeakyReLU.c @@ -0,0 +1,57 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/LeakyReLU.c" +#else + +void THNN_(LeakyReLU_updateOutput)( + 
THNNState *state, + THTensor *input, + THTensor *output, + accreal negval_, + bool inplace) +{ + real negval = TH_CONVERT_ACCREAL_TO_REAL(negval_); + if (inplace) + { + TH_TENSOR_APPLY(real, input, + if (*input_data <= 0) + *input_data *= negval; + ); + THTensor_(set)(output, input); + } + else + { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY2(real, output, real, input, + *output_data = *input_data > 0 ? *input_data : *input_data * negval; + ); + } +} + +void THNN_(LeakyReLU_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + accreal negval_, + bool inplace) +{ + real negval = TH_CONVERT_ACCREAL_TO_REAL(negval_); + THNN_CHECK_NELEMENT(input, gradOutput); + if (inplace) + { + TH_TENSOR_APPLY2(real, gradOutput, real, input, + if (*input_data <= 0) + *gradOutput_data *= negval; + ); + THTensor_(set)(gradInput, gradOutput); + } + else + { + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + *gradInput_data = *input_data > 0 ? *gradOutput_data : *gradOutput_data * negval; + ); + } +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Linear.c b/contrib/lua-torch/nn/lib/THNN/generic/Linear.c new file mode 100644 index 000000000..8c5cd115e --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/Linear.c @@ -0,0 +1,114 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Linear.c" +#else + +void THNN_(Linear_updateAddBuffer)( + THNNState *state, + THTensor *input, + THTensor *addBuffer) +{ + long nframe = THTensor_(size)(input,0); + long nElement = THTensor_(nElement)(addBuffer); + if (nElement != nframe) { + THTensor_(resize1d)(addBuffer,nframe); + THTensor_(fill)(addBuffer,1.0); + } +} + +void THNN_(Linear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *addBuffer) +{ + long dim = THTensor_(nDimension)(input); + if (dim == 1) { + THTensor_(resize1d)(output,THTensor_(size)(weight,0)); + if (bias) { + THTensor_(copy)(output,bias); + } + else { + THTensor_(zero)(output); + } + THTensor_(addmv)(output,1,output,1,weight,input); + } + else if (dim == 2) { + long nframe = THTensor_(size)(input,0); + long nElement = THTensor_(nElement)(output); + THTensor_(resize2d)(output,nframe,THTensor_(size)(weight,0)); + if (THTensor_(nElement)(output) != nElement) { + THTensor_(zero)(output); + } + THNN_(Linear_updateAddBuffer)(state,input,addBuffer); + THTensor *tweight = THTensor_(new)(); + THTensor_(transpose)(tweight,weight,0,1); + THTensor_(addmm)(output,0,output,1,input,tweight); + THTensor_(free)(tweight); + if (bias) { + THTensor_(addr)(output,1,output,1,addBuffer,bias); + } + } +} + +void THNN_(Linear_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight) +{ + if (gradInput) { + long nElement = THTensor_(nElement)(gradInput); + THTensor_(resizeAs)(gradInput,input); + if (THTensor_(nElement)(gradInput) != nElement) { + THTensor_(zero)(gradInput); + } + + long dim = THTensor_(nDimension)(input); + if (dim == 1) { + THTensor *tweight = THTensor_(new)(); + THTensor_(transpose)(tweight,weight,0,1); + THTensor_(addmv)(gradInput,0,gradInput,1,tweight,gradOutput); + THTensor_(free)(tweight); + } + else if (dim == 2) { + THTensor_(addmm)(gradInput,0,gradInput,1,gradOutput,weight); + } + } +} + +void THNN_(Linear_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor 
*weight, + THTensor *bias, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *addBuffer, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + long dim = THTensor_(nDimension)(input); + if (dim == 1) { + THTensor_(addr)(gradWeight,1,gradWeight,scale,gradOutput,input); + if (bias) { + THTensor_(cadd)(gradBias,gradBias,scale,gradOutput); + } + } + else if (dim == 2) { + THTensor *tgradOutput = THTensor_(new)(); + THTensor_(transpose)(tgradOutput,gradOutput,0,1); + THTensor_(addmm)(gradWeight,1,gradWeight,scale,tgradOutput,input); + if (bias) { + THNN_(Linear_updateAddBuffer)(state,input,addBuffer); + THTensor_(addmv)(gradBias,1,gradBias,scale,tgradOutput,addBuffer); + } + THTensor_(free)(tgradOutput); + } +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/LogSigmoid.c b/contrib/lua-torch/nn/lib/THNN/generic/LogSigmoid.c new file mode 100644 index 000000000..651d56002 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/LogSigmoid.c @@ -0,0 +1,36 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/LogSigmoid.c" +#else + +void THNN_(LogSigmoid_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *buffer) +{ + THTensor_(resizeAs)(output, input); + THTensor_(resizeAs)(buffer, input); + + TH_TENSOR_APPLY3(real, output, real, input, real, buffer, + real z = exp(-*input_data); + *buffer_data = z; + *output_data = -log(1. + z); + ); +} + +void THNN_(LogSigmoid_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *buffer) +{ + THNN_CHECK_NELEMENT(input, gradOutput); + THTensor_(resizeAs)(gradInput, buffer); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, buffer, + real z = *buffer_data; + *gradInput_data = *gradOutput_data * z / (1. 
+ z); + ); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/LogSoftMax.c b/contrib/lua-torch/nn/lib/THNN/generic/LogSoftMax.c new file mode 100644 index 000000000..a7280422b --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/LogSoftMax.c @@ -0,0 +1,137 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/LogSoftMax.c" +#else + +void THNN_(LogSoftMax_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output) +{ + real *input_data, *output_data; + ptrdiff_t nframe = 0, dim = 0, stride = 0; + ptrdiff_t t, d; + + if (input->nDimension == 1) + { + nframe = 1; + dim = input->size[0]; + stride = 1; + } + else if (input->nDimension == 2) + { + nframe = input->size[0]; + dim = input->size[1]; + stride = 1; + } + else if (input->nDimension == 3) + { + nframe = 1; + dim = input->size[0]; + stride = input->size[1]*input->size[2]; + } + else if (input->nDimension == 4) + { + nframe = input->size[0]; + dim = input->size[1]; + stride = input->size[2]*input->size[3]; + } + else + THArgCheck(0, 2, "1D, 2D, 3D or 4D tensor expected"); + + input = THTensor_(newContiguous)(input); + THTensor_(resizeAs)(output, input); + + real *input_data0 = THTensor_(data)(input); + real *output_data0 = THTensor_(data)(output); + + accreal logsum; + real maxInput; + #pragma omp parallel for private(t, d, maxInput, logsum, input_data, output_data) + for (t = 0; t < stride*nframe; t++) + { + logsum = 0; + maxInput = -THInf; + input_data = input_data0 + (t/stride)*dim*stride + t % stride; + output_data = output_data0 + (t/stride)*dim*stride + t % stride; + + for (d = 0; d < dim; d++) + maxInput = THMax(maxInput, input_data[d*stride]); + + for (d = 0; d < dim; d++) + logsum += exp(input_data[d*stride] - maxInput); + logsum = maxInput + log(logsum); + + for (d = 0; d < dim; d++) + output_data[d*stride] = input_data[d*stride] - logsum; + } + + THTensor_(free)(input); +} + +void THNN_(LogSoftMax_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output) +{ + THNN_CHECK_SHAPE(input, gradOutput); + real *gradInput_data, *gradOutput_data, *output_data; + ptrdiff_t nframe = 0, dim = 0, stride = 0; + ptrdiff_t t, d; + + if (output->nDimension == 1) + { + nframe = 1; + dim = output->size[0]; + stride = 1; + } + else if (output->nDimension == 2) + { + nframe = output->size[0]; + dim = output->size[1]; + stride = 1; + } + else if (output->nDimension == 3) + { + nframe = 1; + dim = output->size[0]; + stride = output->size[1]*output->size[2]; + } + else if (output->nDimension == 4) + { + nframe = output->size[0]; + dim = output->size[1]; + stride = output->size[2]*output->size[3]; + } + else + THError("1D, 2D, 3D or 4D tensor expected"); + + output = THTensor_(newContiguous)(output); + gradOutput = THTensor_(newContiguous)(gradOutput); + + THTensor_(resizeAs)(gradInput, output); + real *gradInput_data0 = THTensor_(data)(gradInput); + real *output_data0 = THTensor_(data)(output); + real *gradOutput_data0 = THTensor_(data)(gradOutput); + accreal sum; + #pragma omp parallel for private(t, sum, d, gradInput_data, output_data, gradOutput_data) + for (t = 0; t < stride*nframe; t++) + { + sum = 0; + gradInput_data = gradInput_data0 + (t/stride)*dim*stride + t % stride; + output_data = output_data0 + (t/stride)*dim*stride + t % stride; + gradOutput_data = gradOutput_data0 + (t/stride)*dim*stride + t % stride; + + for (d = 0; d < dim; d++) + sum += gradOutput_data[d*stride]; + + for (d = 0; d < dim; d++) + gradInput_data[d*stride] = 
gradOutput_data[d*stride] - exp(output_data[d*stride])*sum; + } + + THTensor_(free)(gradOutput); + THTensor_(free)(output); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/LookupTable.c b/contrib/lua-torch/nn/lib/THNN/generic/LookupTable.c new file mode 100644 index 000000000..46bc2c3c1 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/LookupTable.c @@ -0,0 +1,225 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/LookupTable.c" +#else + +static void THNN_(LookupTable_resetCount)( + THInteger_t *count_data, + THIndexTensor *input) +{ + ptrdiff_t i; + THIndex_t *input_data = THIndexTensor_(data)(input); + ptrdiff_t numel = THIndexTensor_(nElement)(input); + + for (i = 0; i<numel; i++) + { + long k = input_data[i] - TH_INDEX_BASE; + count_data[k] = 0; + } + for (i = 0; i<numel; i++) + { + long k = input_data[i] - TH_INDEX_BASE; + count_data[k]++; + } +} + +void THNN_(LookupTable_accGradParameters)( + THNNState *state, + THIndexTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THIntegerTensor *count, + THTensor *sorted, + THIndexTensor *indices, + bool scaleGradByFreq, + int paddingValue, + accreal ascale) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(ascale); + ptrdiff_t i; + THInteger_t *count_data = NULL; + + if (scaleGradByFreq) + { + THIntegerTensor_(resize1d)(count, gradWeight->size[0]); + count_data = THIntegerTensor_(data)(count); + } + + if (!THTensor_(isContiguous)(gradWeight)) + THError("gradWeight must be contiguous"); + if (!THIndexTensor_(isContiguous)(input)) + THError("input must be contiguous"); + if (THIndexTensor_(nDimension)(input) != 1 && THIndexTensor_(nDimension)(input) != 2) { + THDescBuff s1 = THIndexTensor_(sizeDesc)(input); + THError("input must be a vector or matrix, but is of shape: %s", s1.str); + } + + THIndex_t *input_data = THIndexTensor_(data)(input); + ptrdiff_t numel = THIndexTensor_(nElement)(input); + long numw = THTensor_(size)(gradWeight, 0); + + // check that inputs are all within range + for (i=0; i<numel; i++) + if (input_data[i] < TH_INDEX_BASE || input_data[i] >= numw + TH_INDEX_BASE) { + THError("inputs need to be in the range %ld <= input < %ld, " + "but got input of value: %ld", TH_INDEX_BASE, (numw + TH_INDEX_BASE), + input_data[i]); + } + + gradOutput = THTensor_(newContiguous)(gradOutput); + + real *gw = THTensor_(data)(gradWeight); + real *go = THTensor_(data)(gradOutput); + long stride = THTensor_(stride)(gradWeight, 0); + + if (count_data) + THNN_(LookupTable_resetCount)(count_data, input); + +#ifdef _OPENMP + if (numel > 1000) + { + // The strategy is to parallelize over sections of the vocabulary, so that + // thread 1 handles updates to gradWeight[0..nVocab/nThreads]. Every thread + // has to traverse the entire input, but the dominating factor is the axpy + // BLAS call. 
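+ // Each thread scans the full input, but applies the axpy update only when
+ // the destination row k falls inside its own [start, end) slice of
+ // gradWeight, so no two threads ever write the same row and no locking
+ // is needed.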
+ #pragma omp parallel private(i) + { + int tid = omp_get_thread_num(); + int nthreads = omp_get_num_threads(); + + long start = tid * (numw/nthreads + 1); + long end = start + (numw/nthreads + 1); + for (i=0; i<numel; i++) + { + if (input_data[i] != paddingValue) + { + long k = input_data[i] - TH_INDEX_BASE; + if (k >= start && k < end) + { + real scale_ = scale; + if (count_data) scale_ /= count_data[k]; + THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1); + } + } + } + } + + THTensor_(free)(gradOutput); + return; + } +#endif + + for (i=0; i<numel; i++) + { + if (input_data[i] != paddingValue) + { + long k = input_data[i] - TH_INDEX_BASE; + real scale_ = scale; + if (count_data) scale_ /= count_data[k]; + THBlas_(axpy)(stride, scale_, go + i*stride, 1, gw + k*stride, 1); + } + } + + THTensor_(free)(gradOutput); +} + +/* + * Keep the norm of weight smaller than maxNorm + */ + +static void THNN_(LookupTable_renormRow)( + real *row_data, + long stride, + real maxNorm, + real normType) +{ + real norm = 0; + real new_norm; + long j; + for (j=0; j<stride; j++) + { + if (normType == 1) { + norm += fabs(row_data[j]); + } else if (normType == 2) { + norm += row_data[j] * row_data[j]; + } else { + norm += pow(fabs(row_data[j]), normType); + } + } + norm = pow(norm, 1.0 / normType); + if (norm > maxNorm) + { + new_norm = maxNorm / (norm + 1e-7); + for (j=0; j<stride; j++) { + row_data[j] *= new_norm; + } + } +} + +static int THNN_(compare_THIndex)(const void* a, const void* b) +{ + return *(const THIndex_t*)a < *(const THIndex_t*)b ? -1 : 1; +} + +void THNN_(LookupTable_renorm)( + THNNState *state, + THIndexTensor *idx, + THTensor *weight, + accreal maxNorm_, + accreal normType_) +{ + real maxNorm = TH_CONVERT_ACCREAL_TO_REAL(maxNorm_); + real normType = TH_CONVERT_ACCREAL_TO_REAL(normType_); + if (!THTensor_(isContiguous)(weight)) + THError("weight must be contiguous"); + if (!THIndexTensor_(isContiguous)(idx)) + THError("input must be contiguous"); + if (THIndexTensor_(nDimension)(idx) != 1) + THError("idx must be a vector"); + if (normType <= 0) + THError("non-positive-norm not supported"); + + ptrdiff_t i; + THIndex_t *row_idx = THIndexTensor_(data)(idx); + ptrdiff_t numel = THIndexTensor_(nElement)(idx); + + long numw = THTensor_(size)(weight, 0); + long stride = THTensor_(stride)(weight, 0); + real *gw = THTensor_(data)(weight); + for (i=0; i<numel; i++) { + if (row_idx[i] < TH_INDEX_BASE || row_idx[i] >= numw + TH_INDEX_BASE) { + THError("input need to be in the range %ld <= input < %ld, " + "but got input of value: %ld", TH_INDEX_BASE, (numw + TH_INDEX_BASE), + row_idx[i]); + } + } + // get unique indices + qsort(row_idx, numel, sizeof(THIndex_t), THNN_(compare_THIndex)); + ptrdiff_t ptr = 0; + for (i=0; i<numel; i++) + if (i == 0 || row_idx[i] != row_idx[i-1]) + row_idx[ptr++] = row_idx[i]; + numel = ptr; + +#ifdef _OPENMP + if (numel > 1000) + { + // The strategy is to parallelize over the rows that appear in + // row_idx, so that thread 1 handles the rows in row_idx[0..numel/nThreads]. + // This distributes the work evenly to each thread. 
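+ // row_idx now holds each row index at most once (sorted and deduplicated
+ // above), so the parallel loop below visits every weight row at most once
+ // and the per-row renorm writes cannot race.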
+ #pragma omp parallel for private(i) + for (i=0; i<numel; i++) + { + long k = row_idx[i] - TH_INDEX_BASE; + THNN_(LookupTable_renormRow)(gw + k*stride, stride, maxNorm, normType); + } + return; + } +#endif + for (i=0; i<numel; i++) + { + long k = row_idx[i] - TH_INDEX_BASE; + THNN_(LookupTable_renormRow)(gw + k*stride, stride, maxNorm, normType); + } +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/MSECriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/MSECriterion.c new file mode 100644 index 000000000..58911f6f0 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/MSECriterion.c @@ -0,0 +1,45 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/MSECriterion.c" +#else + +void THNN_(MSECriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage) +{ + THNN_CHECK_NELEMENT(input, target); + THNN_CHECK_DIM_SIZE(output, 1, 0, 1); + + real sum = 0; + + TH_TENSOR_APPLY2(real, input, real, target, + real z = (*input_data - *target_data); + sum += z*z; + ); + + if (sizeAverage) + sum /= THTensor_(nElement)(input); + + THTensor_(set1d)(output, 0, sum); +} + +void THNN_(MSECriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage) +{ + THNN_CHECK_NELEMENT(input, target); + + real norm = (sizeAverage ? 2./((real)THTensor_(nElement)(input)) : 2.); + + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + *gradInput_data = norm * (*input_data - *target_data); + ); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/MarginCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/MarginCriterion.c new file mode 100644 index 000000000..d6d9b60b9 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/MarginCriterion.c @@ -0,0 +1,47 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/MarginCriterion.c" +#else + +void THNN_(MarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage, + accreal margin_) +{ + real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_); + THNN_CHECK_NELEMENT(input, target); + THNN_CHECK_DIM_SIZE(output, 1, 0, 1); + real sum = 0; + + TH_TENSOR_APPLY2(real, input, real, target, + real z = (margin - *input_data * *target_data); + sum += z>0 ? z : 0; + ); + + if (sizeAverage) + sum /= THTensor_(nElement)(input); + + THTensor_(set1d)(output, 0, sum); +} + +void THNN_(MarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage, + accreal margin_) +{ + real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_); + THNN_CHECK_NELEMENT(input, target); + real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.); + + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + *gradInput_data = (*input_data * *target_data) < margin ? 
-norm * *target_data : 0; + ); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/MultiLabelMarginCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/MultiLabelMarginCriterion.c new file mode 100644 index 000000000..16398c13c --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/MultiLabelMarginCriterion.c @@ -0,0 +1,184 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/MultiLabelMarginCriterion.c" +#else + +// TODO: improve error messages +void THNN_(MultiLabelMarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *output, + THTensor *isTarget, + bool sizeAverage) +{ + real *input_data, *isTarget_data; + THIndex_t *target_data; + long nframe, dim; + long t, d, dt, ddt; + real sum; + + THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, + "vector or matrix expected"); + + if (input->nDimension == 1) + { + nframe = 1; + dim = input->size[0]; + THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3, + "inconsistent target size"); + } + else + { + nframe = input->size[0]; + dim = input->size[1]; + THArgCheck((target->nDimension == 2) && (target->size[0] == nframe) + && (target->size[1] == dim), 3, "inconsistent target size"); + } + + THArgCheck(THIndexTensor_(minall)(target) >= -1+TH_INDEX_BASE, 3, "target out of range"); + THArgCheck(THIndexTensor_(maxall)(target) < dim+TH_INDEX_BASE, 3, "target out of range"); + + target = THIndexTensor_(newContiguous)(target); + input = THTensor_(newContiguous)(input); + input_data = THTensor_(data)(input); + target_data = THIndexTensor_(data)(target); + + THNN_resizeAs_indices(isTarget, target); + THTensor_(zero)(isTarget); + isTarget_data = THTensor_(data)(isTarget); + + sum = 0; + for (t = 0; t < nframe; t++) + { + for (ddt = 0; ddt < dim; ddt++) + { + THIndex_t target_idx = target_data[ddt] - TH_INDEX_BASE; + if (target_idx < 0) + break; + isTarget_data[target_idx] = 1; + } + for (dt = 0; dt < dim; dt++) + { + THIndex_t target_idx = target_data[dt] - TH_INDEX_BASE; + real input_target; + if (target_idx < 0) + break; + + input_target = input_data[target_idx]; + for (d = 0; d < dim; d++) + { + if (!isTarget_data[d]) + { + real z = 1 - input_target + input_data[d]; + if (z > 0) + sum += z; + } + } + } + input_data += dim; + target_data += dim; + isTarget_data += dim; + } + + sum /= dim; + if (sizeAverage) + sum /= nframe; + + THTensor_(set1d)(output, 0, sum); + + THTensor_(free)(input); + THIndexTensor_(free)(target); +} + +void THNN_(MultiLabelMarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *gradInput, + THTensor *isTarget, + bool sizeAverage) +{ + real *input_data; + real *gradInput_data; + THIndex_t *target_data; + real *isTarget_data; + long nframe, dim; + long t, d, dt; + real g; + + THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, + "vector or matrix expected"); + + if (input->nDimension == 1) + { + nframe = 1; + dim = input->size[0]; + THArgCheck((target->nDimension == 1) && (target->size[0] == dim), 3, + "inconsistent target size"); + THArgCheck((isTarget->nDimension == 1) && (isTarget->size[0] == dim), 3, + "inconsistent isTarget size"); + } + else + { + nframe = input->size[0]; + dim = input->size[1]; + THArgCheck((target->nDimension == 2) && (target->size[0] == nframe) + && (target->size[1] == dim), 3, "inconsistent target size"); + THArgCheck((isTarget->nDimension == 2) && (isTarget->size[0] == nframe) + && (isTarget->size[1] == dim), 3, "inconsistent 
isTarget size"); + } + + THArgCheck(THIndexTensor_(minall)(target) >= -1+TH_INDEX_BASE, 3, "target out of range"); + THArgCheck(THIndexTensor_(maxall)(target) < dim+TH_INDEX_BASE, 3, "target out of range"); + + THArgCheck(THTensor_(minall)(isTarget) >= 0, 3, "isTarget out of range"); + THArgCheck(THTensor_(maxall)(isTarget) <= 1, 3, "isTarget out of range"); + + target = THIndexTensor_(newContiguous)(target); + input = THTensor_(newContiguous)(input); + isTarget = THTensor_(newContiguous)(isTarget); + input_data = THTensor_(data)(input); + target_data = THIndexTensor_(data)(target); + isTarget_data = THTensor_(data)(isTarget); + + g = sizeAverage ? ( 1./((real)(nframe*dim)) ) : ( 1./((real)dim) ); + + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + gradInput_data = THTensor_(data)(gradInput); + + for (t = 0; t < nframe; t++) + { + for (dt = 0; dt < dim; dt++) + { + THIndex_t target_idx = target_data[dt] - TH_INDEX_BASE; + real input_target; + if (target_idx < 0) + break; + + input_target = input_data[target_idx]; + for (d = 0; d < dim; d++) + { + if (!isTarget_data[d]) + { + real z = 1 - input_target + input_data[d]; + if (z > 0) + { + gradInput_data[target_idx] -= g; + gradInput_data[d] += g; + } + } + } + } + input_data += dim; + target_data += dim; + isTarget_data += dim; + gradInput_data += dim; + } + + THTensor_(free)(input); + THIndexTensor_(free)(target); + THTensor_(free)(isTarget); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/MultiMarginCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/MultiMarginCriterion.c new file mode 100644 index 000000000..2f8f8ff58 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/MultiMarginCriterion.c @@ -0,0 +1,168 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/MultiMarginCriterion.c" +#else + +// TODO: improve error messages +void THNN_(MultiMarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *output, + bool sizeAverage, + int p, + THTensor *weights, + accreal margin_) +{ + real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_); + real *input_data, *weights_data; + THIndex_t *target_data; + long nframe, dim; + long t, d; + real sum; + + THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, + "vector or matrix expected"); + + if (input->nDimension == 1) + { + nframe = 1; + dim = input->size[0]; + } + else + { + nframe = input->size[0]; + dim = input->size[1]; + THArgCheck((target->nDimension == 1) && (target->size[0] == nframe), 3, + "inconsistent target size"); + } + + for (t = 0; t < nframe; t++) + { + THIndex_t idx = THIndexTensor_(get1d)(target, t); + THArgCheck((idx >= TH_INDEX_BASE) && (idx < dim + TH_INDEX_BASE), 3, + "target out of range"); + } + + input = THTensor_(newContiguous)(input); + target = THIndexTensor_(newContiguous)(target); + weights = weights ? THTensor_(newContiguous)(weights) : NULL; + input_data = THTensor_(data)(input); + target_data = THIndexTensor_(data)(target); + weights_data = weights ? THTensor_(data)(weights) : NULL; + + sum = 0; + for (t = 0; t < nframe; t++) + { + THIndex_t target_idx = target_data[t] - TH_INDEX_BASE; + real input_target = input_data[target_idx]; + for (d = 0; d < dim; d++) + { + real z = margin - input_target + input_data[d]; + if (d == target_idx) + continue; + + if (z > 0) { + real h = (p==1) ? 
z : z*z; + if(weights_data) + h *= weights_data[target_idx]; + sum += h; + } + } + input_data += dim; + } + + sum /= dim; + if(sizeAverage) + sum /= nframe; + + THTensor_(set1d)(output, 0, sum); + + THTensor_(free)(input); + THIndexTensor_(free)(target); + if(weights) + THTensor_(free)(weights); +} + +void THNN_(MultiMarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *gradInput, + bool sizeAverage, + int p, + THTensor *weights, + accreal margin_) +{ + real margin = TH_CONVERT_ACCREAL_TO_REAL(margin_); + real *input_data; + real *gradInput_data; + THIndex_t *target_data; + real *weights_data; + long nframe, dim; + long t, d; + real g; + + THArgCheck((input->nDimension == 1) || (input->nDimension == 2), 2, + "vector or matrix expected"); + + if (input->nDimension == 1) + { + nframe = 1; + dim = input->size[0]; + } + else + { + nframe = input->size[0]; + dim = input->size[1]; + THArgCheck((target->nDimension == 1) && (target->size[0] == nframe), 3, + "inconsistent target size"); + } + + g = (sizeAverage ? 1./((real)(nframe*dim)) : 1./((real)dim)); + + input = THTensor_(newContiguous)(input); + target = THIndexTensor_(newContiguous)(target); + input_data = THTensor_(data)(input); + + THTensor_(resizeAs)(gradInput, input); + gradInput_data = THTensor_(data)(gradInput); + + target_data = THIndexTensor_(data)(target); + weights = weights ? THTensor_(newContiguous)(weights) : NULL; + weights_data = weights ? THTensor_(data)(weights) : NULL; + + for (t = 0; t < nframe; t++) + { + THIndex_t target_idx = target_data[t] - TH_INDEX_BASE; + real input_target = input_data[target_idx]; + real gradInput_target = 0; + for (d = 0; d < dim; d++) + { + real z = margin - input_target + input_data[d]; + if (d == target_idx) + continue; + + if (z > 0) + { + real h = (p == 1) ? g : 2*g*z; + if(weights_data) + h *= weights_data[target_idx]; + gradInput_target -= h; + gradInput_data[d] = h; + } + else + gradInput_data[d] = 0; + } + gradInput_data[target_idx] = gradInput_target; + + input_data += dim; + gradInput_data += dim; + } + + THTensor_(free)(input); + THIndexTensor_(free)(target); + if(weights) + THTensor_(free)(weights); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/PReLU.c b/contrib/lua-torch/nn/lib/THNN/generic/PReLU.c new file mode 100644 index 000000000..488322fde --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/PReLU.c @@ -0,0 +1,207 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/PReLU.c" +#else + +void THNN_(PReLU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THIndex_t nOutputPlane) +{ + THTensor_(resizeAs)(output, input); + + if (nOutputPlane == 0) + { + // handle shared parameter case + real w = *THTensor_(data)(weight); + TH_TENSOR_APPLY2(real, output, real, input, + *output_data = (*input_data > 0) ? *input_data : w*(*input_data); + ); + } + else + { + input = THTensor_(newContiguous)(input); + long bs = 1, ks = 1; + { + long input_ndim = THTensor_(nDimension)(input); + if (input->size[input_ndim > 1] != nOutputPlane) + THError("Wrong number of input planes. 
Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]); + + if (input_ndim > 1) { + bs = input->size[0]; + for (int d = 2; d < input_ndim; d++) { + ks *= input->size[d]; + } + } + } + + real *output_data = THTensor_(data)(output); + real *input_data = THTensor_(data)(input); + real *weight_data = THTensor_(data)(weight); + THIndex_t i, j, k; +#pragma omp parallel for private(j,k) + for (i = 0; i < bs; ++i) + { + real* n_input_data = input_data + i*nOutputPlane*ks; + real* n_output_data = output_data + i*nOutputPlane*ks; + for (j = 0; j < nOutputPlane; ++j) + { + for (k = 0; k < ks; ++k) + n_output_data[k] = (n_input_data[k] > 0) ? n_input_data[k] : weight_data[j] * n_input_data[k]; + n_input_data += ks; + n_output_data += ks; + } + } + THTensor_(free)(input); + } +} + +void THNN_(PReLU_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THIndex_t nOutputPlane) +{ + THNN_CHECK_NELEMENT(input, gradOutput); + THTensor_(resizeAs)(gradInput, input); + + if (nOutputPlane == 0) + { + real w = THTensor_(data)(weight)[0]; + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + if ((*input_data) > 0) + *gradInput_data = *gradOutput_data; + else + *gradInput_data = w * (*gradOutput_data); + ); + } + else + { + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + weight = THTensor_(newContiguous)(weight); + const real *input_data = THTensor_(data)(input); + const real *gradOutput_data = THTensor_(data)(gradOutput); + const real *weight_data = THTensor_(data)(weight); + real *gradInput_data = THTensor_(data)(gradInput); + + long bs = 1, ks = 1; + { + long input_ndim = THTensor_(nDimension)(input); + if (input->size[input_ndim > 1] != nOutputPlane) + THError("Wrong number of input planes. 
Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]); + + if (input_ndim > 1) { + bs = input->size[0]; + for (int d = 2; d < input_ndim; d++) { + ks *= input->size[d]; + } + } + } + + THIndex_t i, j, k; +#pragma omp parallel for private(j,k) + for (i = 0; i < bs; ++i) + { + const real *n_input_data = input_data + i*nOutputPlane*ks; + const real *n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks; + real *n_gradInput_data = gradInput_data + i*nOutputPlane*ks; + + for (j = 0; j < nOutputPlane; ++j) + { + real w = weight_data[j]; + for (k = 0; k < ks; ++k) + { + if (n_input_data[k] > 0) + n_gradInput_data[k] = n_gradOutput_data[k]; + else + n_gradInput_data[k] = n_gradOutput_data[k] * w; + } + n_input_data += ks; + n_gradInput_data += ks; + n_gradOutput_data += ks; + } + } + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); + } +} + +void THNN_(PReLU_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradWeight, + THTensor *gradWeightBuf, + THTensor *gradWeightBuf2, + THIndex_t nOutputPlane, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + THNN_CHECK_NELEMENT(input, gradOutput); + + if (nOutputPlane == 0) + { + real *gradWeight_data = THTensor_(data)(gradWeight); + real sum = 0; + TH_TENSOR_APPLY2(real, input, real, gradOutput, + if ((*input_data) <= 0) + sum += (*input_data) * (*gradOutput_data); + ); + gradWeight_data[0] += scale * sum; + } + else + { + THArgCheck(THTensor_(isContiguous)(gradWeight), 6, "gradWeight needs to be contiguous"); + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + weight = THTensor_(newContiguous)(weight); + long bs = 1, ks = 1; + { + long input_ndim = THTensor_(nDimension)(input); + if (input->size[input_ndim > 1] != nOutputPlane) + THError("Wrong number of input planes. 
Expected %d but got %d.", nOutputPlane, input->size[input_ndim > 1]); + + if (input_ndim > 1) { + bs = input->size[0]; + for (int d = 2; d < input_ndim; d++) { + ks *= input->size[d]; + } + } + } + + const real *input_data = THTensor_(data)(input); + const real *gradOutput_data = THTensor_(data)(gradOutput); + const real *weight_data = THTensor_(data)(weight); + real *gradWeight_data = THTensor_(data)(gradWeight); + + THIndex_t i, j, k; + for (i = 0; i < bs; ++i) + { + const real *n_input_data = input_data + i*nOutputPlane*ks; + const real *n_gradOutput_data = gradOutput_data + i*nOutputPlane*ks; + + for (j = 0; j < nOutputPlane; ++j) + { + real sum = 0; + for (k = 0; k < ks; ++k) + if (n_input_data[k] <= 0) + sum += n_gradOutput_data[k] * n_input_data[k]; + gradWeight_data[j] += scale * sum; + n_input_data += ks; + n_gradOutput_data += ks; + } + } + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); + } +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/RReLU.c b/contrib/lua-torch/nn/lib/THNN/generic/RReLU.c new file mode 100644 index 000000000..8fd46d3c2 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/RReLU.c @@ -0,0 +1,132 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/RReLU.c" +#else + +void THNN_(RReLU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *noise, + accreal lower_, + accreal upper_, + bool train, + bool inplace, + THGenerator *generator) +{ + real lower = TH_CONVERT_ACCREAL_TO_REAL(lower_); + real upper = TH_CONVERT_ACCREAL_TO_REAL(upper_); + if (train) + { + // get default random generator + THTensor_(resizeAs)(noise, input); + if (inplace) + { + TH_TENSOR_APPLY2(real, input, real, noise, + if (*input_data <= 0) + { + const real r = (real)THRandom_uniform(generator, lower, upper); + *input_data = (*input_data) * r; + *noise_data = r; + } + else + { + *noise_data = 1; + } + ); + THTensor_(set)(output, input); + } + else + { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY3(real, input, real, output, real, noise, + if (*input_data <= 0) + { + const real r = (real)THRandom_uniform(generator, lower, upper); + *output_data = (*input_data) * r; + *noise_data = r; + } + else + { + *output_data = *input_data; + *noise_data = 1; + } + ); + } + } + else + { + const real negSlope = (lower + upper) / 2; + if (inplace) + { + TH_TENSOR_APPLY(real, input, + if (*input_data <= 0) + { + *input_data = *input_data * negSlope; + } + ); + THTensor_(set)(output, input); + } + else + { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY2(real, input, real, output, + const real r = (*input_data) <= 0 ? negSlope : 1; + *output_data = *input_data * r; + ); + } + } +} + +void THNN_(RReLU_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *noise, + accreal lower_, + accreal upper_, + bool train, + bool inplace) +{ + real lower = TH_CONVERT_ACCREAL_TO_REAL(lower_); + real upper = TH_CONVERT_ACCREAL_TO_REAL(upper_); + THNN_CHECK_NELEMENT(input, gradOutput); + if (train && upper - lower > 1E-6) // e.g. 
if upper == lower, RReLU behaves like LeakyReLU + { + // multiply the gradient by the noise tensor + if (inplace) + { + THTensor_(cmul)(gradOutput, gradOutput, noise); + THTensor_(set)(gradInput, gradOutput); + } + else + { + THTensor_(resizeAs)(gradInput, input); + THTensor_(cmul)(gradInput, gradOutput, noise); + } + } + else + { + // use constant factor for negative input values + const real negSlope = (lower + upper) / 2; + if (inplace) + { + TH_TENSOR_APPLY2(real, gradOutput, real, input, + if (*input_data <= 0) + { + *gradOutput_data = (*gradOutput_data) * negSlope; + } + ); + THTensor_(set)(gradInput, gradOutput); + } + else + { + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + *gradInput_data = (*input_data) <= 0 ? (*gradOutput_data) * negSlope : (*gradOutput_data); + ); + } + } +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Sigmoid.c b/contrib/lua-torch/nn/lib/THNN/generic/Sigmoid.c new file mode 100644 index 000000000..17fb2cb4d --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/Sigmoid.c @@ -0,0 +1,28 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Sigmoid.c" +#else + +void THNN_(Sigmoid_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output) +{ + THTensor_(sigmoid)(output, input); +} + +void THNN_(Sigmoid_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output) +{ + THNN_CHECK_NELEMENT(output, gradOutput); + THTensor_(resizeAs)(gradInput, output); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, + real z = *output_data; + *gradInput_data = *gradOutput_data * (1. - z) * z; + ); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SmoothL1Criterion.c b/contrib/lua-torch/nn/lib/THNN/generic/SmoothL1Criterion.c new file mode 100644 index 000000000..d1928d11c --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/SmoothL1Criterion.c @@ -0,0 +1,49 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SmoothL1Criterion.c" +#else + +void THNN_(SmoothL1Criterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage) +{ + THNN_CHECK_NELEMENT(input, target); + THNN_CHECK_DIM_SIZE(output, 1, 0, 1); + + real sum = 0; + TH_TENSOR_APPLY2(real, input, real, target, + real z = fabs(*input_data - *target_data); + sum += z < 1 ? 0.5*z*z : z - 0.5; + ); + + if (sizeAverage) + sum /= THTensor_(nElement)(input); + + THTensor_(set1d)(output, 0, sum); +} + +void THNN_(SmoothL1Criterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage) +{ + THNN_CHECK_NELEMENT(input, target); + real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.); + + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + real x = *input_data - *target_data; + if (x < -1.) + *gradInput_data = - norm; + else if (x > 1.) 
+ *gradInput_data = norm; + else + *gradInput_data = norm * x; + ); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SoftMarginCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/SoftMarginCriterion.c new file mode 100644 index 000000000..bac0a3b53 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/SoftMarginCriterion.c @@ -0,0 +1,44 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SoftMarginCriterion.c" +#else + +void THNN_(SoftMarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage) +{ + THNN_CHECK_NELEMENT(input, target); + THNN_CHECK_DIM_SIZE(output, 1, 0, 1); + + real sum; + + sum = 0; + TH_TENSOR_APPLY2(real, input, real, target, + real z = log(1. + exp(-*input_data* *target_data)); + sum += z;) + + if(sizeAverage) + sum /= THTensor_(nElement)(input); + + THTensor_(set1d)(output, 0, sum); +} + +void THNN_(SoftMarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage) +{ + THNN_CHECK_NELEMENT(input, target); + real norm = (sizeAverage ? 1./((real)THTensor_(nElement)(input)) : 1.); + + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, input, real, target, + real z = exp(-*target_data * *input_data); + *gradInput_data = -norm*(*target_data)*z/(1. + z);) +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SoftMax.c b/contrib/lua-torch/nn/lib/THNN/generic/SoftMax.c new file mode 100644 index 000000000..7b60d64c2 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/SoftMax.c @@ -0,0 +1,150 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SoftMax.c" +#else + +void THNN_(SoftMax_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output) +{ + real *input_data, *output_data; + ptrdiff_t nframe = 0, dim = 0, stride = 0; + ptrdiff_t t; + + if (input->nDimension == 1) + { + nframe = 1; + dim = input->size[0]; + stride = 1; + } + else if (input->nDimension == 2) + { + nframe = input->size[0]; + dim = input->size[1]; + stride = 1; + } + else if (input->nDimension == 3) + { + nframe = 1; + dim = input->size[0]; + stride = input->size[1]*input->size[2]; + } + else if (input->nDimension == 4) + { + nframe = input->size[0]; + dim = input->size[1]; + stride = input->size[2]*input->size[3]; + } + else + { + THArgCheck(0, 2, "1D, 2D, 3D or 4D tensor expected"); + } + + input = THTensor_(newContiguous)(input); + THTensor_(resizeAs)(output, input); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(t) + for (t = 0; t < stride*nframe; t++) + { + real *input_ptr = input_data + (t/stride)*dim*stride + t % stride; + real *output_ptr = output_data + (t/stride)*dim*stride + t % stride; + + real inputMax = -THInf; + accreal sum; + + ptrdiff_t d; + for (d = 0; d < dim; d++) + { + if (input_ptr[d*stride] >= inputMax) inputMax = input_ptr[d*stride]; + } + + sum = 0; + for (d = 0; d < dim; d++) + { + real z = exp(input_ptr[d*stride] - inputMax); + output_ptr[d*stride] = z; + sum += z; + } + + for (d = 0; d < dim; d++) + { + output_ptr[d*stride] *= 1/sum; + } + } + + THTensor_(free)(input); +} + +void THNN_(SoftMax_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output) +{ + THNN_CHECK_SHAPE(input, gradOutput); + real *gradInput_data, *gradOutput_data, *output_data; + ptrdiff_t nframe = 0, dim = 0, stride = 0; + ptrdiff_t t; + + 
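// Softmax backward: with y = softmax(x),
+ // dL/dx_i = y_i * (dL/dy_i - sum_j y_j * dL/dy_j),
+ // which the per-frame loop below computes from the saved output.
+ 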
if (output->nDimension == 1) + { + nframe = 1; + dim = output->size[0]; + stride = 1; + } + else if (output->nDimension == 2) + { + nframe = output->size[0]; + dim = output->size[1]; + stride = 1; + } + else if (output->nDimension == 3) + { + nframe = 1; + dim = output->size[0]; + stride = output->size[1]*output->size[2]; + } + else if (output->nDimension == 4) + { + nframe = output->size[0]; + dim = output->size[1]; + stride = output->size[2]*output->size[3]; + } + else + { + THError("1D, 2D, 3D or 4D tensor expected"); + } + + gradOutput = THTensor_(newContiguous)(gradOutput); + output = THTensor_(newContiguous)(output); + + THTensor_(resizeAs)(gradInput, output); + gradInput_data = THTensor_(data)(gradInput); + output_data = THTensor_(data)(output); + gradOutput_data = THTensor_(data)(gradOutput); + +#pragma omp parallel for private(t) + for (t = 0; t < stride*nframe; t++) + { + real *gradInput_ptr = gradInput_data + (t/stride)*dim*stride + t % stride; + real *output_ptr = output_data + (t/stride)*dim*stride + t % stride; + real *gradOutput_ptr = gradOutput_data + (t/stride)*dim*stride + t % stride; + + ptrdiff_t d; + accreal sum = 0; + for (d = 0; d < dim; d++) + sum += (accreal)gradOutput_ptr[d*stride] * output_ptr[d*stride]; + + for (d = 0; d < dim; d++) + gradInput_ptr[d*stride] = output_ptr[d*stride] * (gradOutput_ptr[d*stride] - sum); + } + + THTensor_(free)(gradOutput); + THTensor_(free)(output); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SoftPlus.c b/contrib/lua-torch/nn/lib/THNN/generic/SoftPlus.c new file mode 100644 index 000000000..6491e66d6 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/SoftPlus.c @@ -0,0 +1,47 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SoftPlus.c" +#else + +void THNN_(SoftPlus_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal beta_, + accreal threshold_) +{ + real beta = TH_CONVERT_ACCREAL_TO_REAL(beta_); + real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_); + THTensor_(resizeAs)(output, input); + + // f(x) = 1/beta * log(1 + exp(beta * x)) + TH_TENSOR_APPLY2(real, output, real, input, \ + *output_data = (*input_data * beta) > threshold ? *input_data : THLog1p(exp(*input_data * beta)) / beta; + ); +} + +void THNN_(SoftPlus_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output, + accreal beta_, + accreal threshold_) +{ + real beta = TH_CONVERT_ACCREAL_TO_REAL(beta_); + real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_); + THNN_CHECK_NELEMENT(input, gradOutput); + THTensor_(resizeAs)(gradInput, output); + + // d/dx[log(1+exp(k*x))/k] = exp(kx) / (exp(kx) + 1) + // SINCE + // y = (1/k)*log(1+exp(k*x)) --> x = (1/k)*log(exp(k*y)-1) + // THEREFORE: + // d/dx(f(x)) = (exp(k*y) - 1) / exp(k*y) + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, + real z = exp(*output_data * beta); + *gradInput_data = (*output_data * beta) > threshold ? 
*gradOutput_data : *gradOutput_data * (z - 1.)/z; + ); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SoftShrink.c b/contrib/lua-torch/nn/lib/THNN/generic/SoftShrink.c new file mode 100644 index 000000000..e77950868 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/SoftShrink.c @@ -0,0 +1,42 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SoftShrink.c" +#else + +void THNN_(SoftShrink_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal lambda_) +{ + real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_); + THTensor_(resizeAs)(output, input); + + TH_TENSOR_APPLY2(real, output, real, input, + if ((*input_data) > lambda) + *output_data = *input_data - lambda; + else if ((*input_data) < -lambda) + *output_data = *input_data + lambda; + else + *output_data = 0; + ); +} + +void THNN_(SoftShrink_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + accreal lambda_) +{ + real lambda = TH_CONVERT_ACCREAL_TO_REAL(lambda_); + THNN_CHECK_NELEMENT(input, gradOutput); + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + if ((*input_data) > lambda || (*input_data) < -lambda) + *gradInput_data = (*gradOutput_data); + else + *gradInput_data = 0; + ); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SparseLinear.c b/contrib/lua-torch/nn/lib/THNN/generic/SparseLinear.c new file mode 100644 index 000000000..1cf712212 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/SparseLinear.c @@ -0,0 +1,564 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SparseLinear.c" +#else + +#ifdef _OPENMP +#include <omp.h> +#endif + +#define ROW_PTR2(t, r) (THTensor_(data)(t) + (r) * (t)->stride[0]) +#define COL_PTR2(t, c) (THTensor_(data)(t) + (c) * (t)->stride[1]) + +static bool THNN_(checkLegacyInput)(THTensor* t) +{ + return t->nDimension == 3 && t->size[2] == 2; +} + +static bool THNN_(checkInput)(THTensor* t) +{ + return t->nDimension == 2 && t->size[1] == 3; +} + +static bool THNN_(checkSize2D)(THTensor* t, long size0, long size1) +{ + return t->nDimension == 2 && t->size[0] == size0 && t->size[1] == size1; +} + +static bool THNN_(checkSize1D)(THTensor* t, long size0) +{ + return t->nDimension == 1 && t->size[0] == size0; +} + +static void THNN_(set1d)(THTensor *t, long x0, real value) { + THStorage_(set)(t->storage, t->storageOffset + x0*t->stride[0], value); +} +static real THNN_(get3d)(const THTensor *t, long x0, long x1, long x2) { + return THStorage_(get)(t->storage, t->storageOffset + + x0*t->stride[0] + x1*t->stride[1] + x2*t->stride[2]); +} +static real THNN_(get2d)(const THTensor *t, long x0, long x1) { + return THStorage_(get)(t->storage, t->storageOffset + + x0*t->stride[0] + x1*t->stride[1]); +} + +void THNN_(SparseLinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias) +{ + long h, i, j, hp0, hp1; + long outDim = THTensor_(size)(weight, 0); + long inDim = THTensor_(size)(weight, 1); + long batchSize = THTensor_(size)(output, 0); + + THArgCheck(THNN_(checkInput)(input), 2, "input must be in coo format, nnz x 3"); + THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous"); + THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong"); + + long nnz = THTensor_(size)(input, 0); + + THLongTensor * csr = THLongTensor_newWithSize1d(batchSize+1); + THLongTensor_zero(csr); + + weight = THTensor_(newContiguous)(weight); + 
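+ // Build CSR-style row pointers from the COO triplets: after this loop,
+ // csr[h] indexes the first entry of sample h and csr[h+1] one past its
+ // last, which appears to assume the nnz x 3 input is ordered by its
+ // sample (row) column.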
+//#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000) + for (i=0; i<nnz; i++) { + hp0 = (long)(THNN_(get2d)(input, i, 0)) - 1; + hp1 = (i+1 == nnz) ? + batchSize : + (long)(THNN_(get2d)(input, i+1, 0)) - 1; + if (hp0 != hp1) for (h = hp0; h < hp1; h++) { + THLongTensor_set1d(csr, h+1, i+1); + } + } + + + // output = weight * input + bias + THTensor_(zero)(output); +#pragma omp parallel for private(h, i) schedule(static) if (nnz > 10000) + for (h = 0; h < batchSize; h++) { + long i_start = THLongTensor_get1d(csr, h); + long i_end = THLongTensor_get1d(csr, h+1); + for (i = i_start; i < i_end; i++) { + real val = THNN_(get2d)(input, i, 2); + if (val == 0) { + continue; + } + + long offset = (long)(THNN_(get2d)(input, i, 1)) - 1; + if (offset >= 0 && offset < inDim) { + THBlas_(axpy)(outDim, + val, + COL_PTR2(weight, offset), weight->stride[0], + ROW_PTR2(output, h), output->stride[1]); + } else { + THError("index out of bound. updateOutput: %d not between 1 and %d", + offset + 1, inDim); + } + } + } + + THTensor* output_row = THTensor_(new)(); + for (h = 0; h < batchSize; h++) { + THTensor_(select)(output_row, output, 0, h); + THTensor_(cadd)(output_row, bias, 1.0, output_row); + } + THTensor_(free)(output_row); + THLongTensor_free(csr); + THTensor_(free)(weight); +} + +void THNN_(SparseLinear_legacyUpdateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias) +{ + long h, i; + long outDim = THTensor_(size)(weight, 0); + long inDim = THTensor_(size)(weight, 1); + + THArgCheck(THNN_(checkLegacyInput)(input), 2, "input size must be batchsize x nnz x 2"); + THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous"); + THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong"); + + weight = THTensor_(newContiguous)(weight); + + long batchSize = THTensor_(size)(input, 0); + long nnz = THTensor_(size)(input, 1); + THTensor_(resize2d)(output, batchSize, outDim); + + // output = weight * input + bias + THTensor_(zero)(output); +#pragma omp parallel for private(h, i) schedule(static) if ( \ + batchSize > 1 && batchSize * nnz * outDim > 10000) + for (h = 0; h < batchSize; h++) { + for (i = 0; i < nnz; i++) { + real val = THNN_(get3d)(input, h, i, 1); + if (val == 0) { + continue; + } + + long offset = (long)(THNN_(get3d)(input, h, i, 0)) - 1; + if (offset >= 0 && offset < inDim) { + THBlas_(axpy)(outDim, + val, + COL_PTR2(weight, offset), weight->stride[0], + ROW_PTR2(output, h), output->stride[1]); + } else { + THError("index out of bound. 
updateOutput: %d not between 1 and %d", + offset + 1, inDim); + } + } + } + + THTensor* output_row = THTensor_(new)(); + for (h = 0; h < batchSize; h++) { + THTensor_(select)(output_row, output, 0, h); + THTensor_(cadd)(output_row, bias, 1.0, output_row); + } + THTensor_(free)(output_row); + THTensor_(free)(weight); +} + +void THNN_(SparseLinear_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + accreal weightDecay_, + accreal scale_) +{ + real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_); + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + long h, i, col, hp0, hp1; + long outDim = THTensor_(size)(weight, 0); + long inDim = THTensor_(size)(weight, 1); + + THArgCheck(THNN_(checkInput)(input), 2, + "input must be in coo format, nnz x 3"); + THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, + "gradWeight size wrong"); + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, + "gradBias size wrong"); + THArgCheck(THTensor_(isContiguous)(gradOutput), 1, + "gradOutput must be contiguous"); + + long nnz = THTensor_(size)(input, 0); + + THLongTensor* csc = THLongTensor_newWithSize1d(inDim+1); + THLongTensor_zero(csc); + weight = THTensor_(newContiguous)(weight); + +#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000) + for (i = 0; i < nnz; i++) { + hp0 = (long)(THNN_(get2d)(input, i, 1)) - 1; + hp1 = (i+1 == nnz) ? + inDim : + (long)(THNN_(get2d)(input, i+1, 1)) - 1; + if (hp0 != hp1) for (h = hp0; h < hp1; h++) { + THLongTensor_set1d(csc, h+1, i+1); + } + } + + // gradWeight += gradOutput * input +#pragma omp parallel for private(h, i, col) schedule(static) if (nnz > 10000) + for (col = 0; col < inDim; col++) { + long i_start = THLongTensor_get1d(csc, col); + long i_end = THLongTensor_get1d(csc, col+1); + for (i = i_start; i < i_end; i++) { + real val = scale * THNN_(get2d)(input, i, 2); + + h = (long)(THNN_(get2d)(input, i, 0)) - 1; + long offset = (long)(THNN_(get2d)(input, i, 1)) - 1; + if (offset >= 0 && offset < inDim) { + THBlas_(axpy)(outDim, + val, + ROW_PTR2(gradOutput, h), gradOutput->stride[1], + COL_PTR2(gradWeight, offset), gradWeight->stride[0]); + } else { + THError( + "index out of bound. 
accGradParameters: %d not between 1 and %d", + offset + 1, + inDim); + } + } + } + + // gradBias += gradOutput + THTensor* buf = THTensor_(new)(); + THTensor_(sum)(buf, gradOutput, 0, 1); + THTensor_(cadd)(gradBias, gradBias, scale, buf); + THTensor_(free)(buf); + THLongTensor_free(csc); + + if (weightDecay != 0) { + THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight); + } + THTensor_(free)(weight); +} + +void THNN_(SparseLinear_legacyAccGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + accreal weightDecay_, + accreal scale_) +{ + real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_); + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + long h, i; + long outDim = THTensor_(size)(weight, 0); + long inDim = THTensor_(size)(weight, 1); + + THArgCheck(THNN_(checkLegacyInput)(input), 2, + "input size must be batchsize x nnz x 2"); + THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, + "gradWeight size wrong"); + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, + "gradBias size wrong"); + THArgCheck(THTensor_(isContiguous)(gradOutput), 1, + "gradOutput must be contiguous"); + + long batchSize = THTensor_(size)(input, 0); + long nnz = THTensor_(size)(input, 1); + THTensor_(resize2d)(gradOutput, batchSize, outDim); + + // gradWeight += gradOutput * input +#pragma omp parallel for private(h, i) schedule(static) if (\ + batchSize * nnz * outDim > 10000) + for (i = 0; i < nnz; i++) { + for (h = 0; h < batchSize; h++) { + real val = scale * THNN_(get3d)(input, h, i, 1); + if (val == 0) { + continue; + } + + long offset = (long)(THNN_(get3d)(input, h, i, 0)) - 1; + if (offset >= 0 && offset < inDim) { + THBlas_(axpy)(outDim, + val, + ROW_PTR2(gradOutput, h), gradOutput->stride[1], + COL_PTR2(gradWeight, offset), gradWeight->stride[0]); + } else { + THError( + "index out of bound. accGradParameters: %d not between 1 and %d", + offset + 1, + inDim); + } + } + } + + // gradBias += gradOutput + THTensor* gradOutput_row = THTensor_(new)(); + for (h = 0; h < batchSize; h++) { + THTensor_(select)(gradOutput_row, gradOutput, 0, h); + THTensor_(cadd)(gradBias, gradBias, scale, gradOutput_row); + } + THTensor_(free)(gradOutput_row); + + if (weightDecay != 0) { + THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight); + } +} + +void THNN_(SparseLinear_updateParameters)( + THNNState *state, + THTensor *weight, + THTensor *bias, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput, + accreal learningRate_) +{ + real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_); + long h, i; + long outDim = weight->size[0]; + long inDim = weight->size[1]; + + THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, + "gradWeight size wrong"); + THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong"); + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); + THArgCheck(THNN_(checkInput)(lastInput), 6, + "input must be in coo format, nnz x 3"); + + + long nnz = THTensor_(size)(lastInput, 0); + + // collect unique offsets of non-0 val in input + THTensor* offsets = THTensor_(newWithSize1d)(nnz); + long cnt = 0; + for (i = 0; i < nnz; i++) { + real val = THNN_(get2d)(lastInput, i, 2); + if (val == 0) { + continue; + } + long offset = (long)(THNN_(get2d)(lastInput, i, 1)) - 1; + if (offset >= 0 && offset < inDim) { + THNN_(set1d)(offsets, cnt++, offset); + } else { + THError( + "index out of bound. 
updateParameters: %d not between 1 and %d", + offset + 1, + inDim); + } + } + if (cnt == 0) return; + THTensor_(resize1d)(offsets, cnt); + + THTensor* uniqueOffsets = THTensor_(new)(); + THLongTensor* ri = THLongTensor_new(); + THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0); + THLongTensor_free(ri); + THTensor_(free)(offsets); + + cnt = 1; + real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets); + for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) { + if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) { + uniqueOffsets_p[cnt++] = uniqueOffsets_p[i]; + } + } + THTensor_(resize1d)(uniqueOffsets, cnt); + + // weight += -learningRate * gradWeight + THTensor_(cadd)(bias, bias, -learningRate, gradBias); +#pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000) + for (i = 0; i < cnt; i++) { + long offset = (long)uniqueOffsets_p[i]; + THBlas_(axpy)(outDim, + -learningRate, + COL_PTR2(gradWeight, offset), gradWeight->stride[0], + COL_PTR2(weight, offset), weight->stride[0]); + } + + THTensor_(free)(uniqueOffsets); +} + +void THNN_(SparseLinear_legacyUpdateParameters)( + THNNState *state, + THTensor *weight, + THTensor *bias, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput, + accreal learningRate_) +{ + real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_); + long h, i; + long outDim = weight->size[0]; + long inDim = weight->size[1]; + + THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4, + "gradWeight size wrong"); + THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong"); + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong"); + THArgCheck(THNN_(checkLegacyInput)(lastInput), 6, + "input size must be batchsize x nnz x 2"); + + + long batchSize = THTensor_(size)(lastInput, 0); + long nnz = THTensor_(size)(lastInput, 1); + + // collect unique offsets of non-0 val in input + THTensor* offsets = THTensor_(newWithSize1d)(batchSize * nnz); + long cnt = 0; + for (h = 0; h < batchSize; h++) { + for (i = 0; i < nnz; i++) { + real val = THNN_(get3d)(lastInput, h, i, 1); + if (val == 0 ) { + continue; + } + long offset = (long)(THNN_(get3d)(lastInput, h, i, 0)) - 1; + if (offset >= 0 && offset < inDim) { + THNN_(set1d)(offsets, cnt++, offset); + } else { + THError( + "index out of bound. 
updateParameters: %d not between 1 and %d", + offset + 1, + inDim); + } + } + } + THTensor_(resize1d)(offsets, cnt); + + THTensor* uniqueOffsets = THTensor_(new)(); + THLongTensor* ri = THLongTensor_new(); + THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0); + THLongTensor_free(ri); + THTensor_(free)(offsets); + + cnt = 1; + real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets); + for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) { + if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) { + uniqueOffsets_p[cnt++] = uniqueOffsets_p[i]; + } + } + THTensor_(resize1d)(uniqueOffsets, cnt); + + // weight += -learningRate * gradWeight + THTensor_(cadd)(bias, bias, -learningRate, gradBias); +#pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000) + for (i = 0; i < cnt; i++) { + long offset = (long)uniqueOffsets_p[i]; + THBlas_(axpy)(outDim, + -learningRate, + COL_PTR2(gradWeight, offset), gradWeight->stride[0], + COL_PTR2(weight, offset), weight->stride[0]); + } + + THTensor_(free)(uniqueOffsets); +} + +void THNN_(SparseLinear_zeroGradParameters)( + THNNState *state, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput) +{ + long h, i, j; + + long outDim = gradWeight->size[0]; + long inDim = gradWeight->size[1]; + + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong"); + THArgCheck(THNN_(checkInput)(lastInput), 4, + "input must be in coo format, nnz x 3"); + + THTensor_(zero)(gradBias); + + long nnz = THTensor_(size)(lastInput, 0); + +#pragma omp parallel for private(i, j) schedule(static) if ( \ + nnz * outDim > 10000) + for (i = 0; i < nnz; i++) { + if (THNN_(get2d)(lastInput, i, 2) == 0 ) { + continue; + } + + long offset = (long)(THNN_(get2d)(lastInput, i, 1)) - 1; + if (offset >= 0 && offset < inDim) { + real* pGradWeight = COL_PTR2(gradWeight, offset); + if (gradWeight->stride[0] == 1) { + THVector_(fill)(pGradWeight, 0, outDim); + } else { + long stride = gradWeight->stride[0]; + for (j = 0; j < outDim; ++j) { + pGradWeight[j * stride] = 0; + } + } + } else { + THError( + "index out of bound. zeroGradParameters: %d not between 1 and %d", + offset + 1, + inDim); + } + } +} + +void THNN_(SparseLinear_legacyZeroGradParameters)( + THNNState *state, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput) +{ + long h, i, j; + + long outDim = gradWeight->size[0]; + long inDim = gradWeight->size[1]; + + THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong"); + THArgCheck(THNN_(checkLegacyInput)(lastInput), 4, + "input size must be batchsize x nnz x 2"); + + THTensor_(zero)(gradBias); + + long batchSize = THTensor_(size)(lastInput, 0); + long nnz = THTensor_(size)(lastInput, 1); + +#pragma omp parallel for private(h, i, j) schedule(static) if ( \ + batchSize > 1 && batchSize * nnz * outDim > 10000) + for (h = 0; h < batchSize; h++) { + for (i = 0; i < nnz; i++) { + if (THNN_(get3d)(lastInput, h, i, 1) == 0 ) { + continue; + } + + long offset = (long)(THNN_(get3d)(lastInput, h, i, 0)) - 1; + if (offset >= 0 && offset < inDim) { + real* pGradWeight = COL_PTR2(gradWeight, offset); + if (gradWeight->stride[0] == 1) { + THVector_(fill)(pGradWeight, 0, outDim); + } else { + long stride = gradWeight->stride[0]; + for (j = 0; j < outDim; ++j) { + pGradWeight[j * stride] = 0; + } + } + } else { + THError( + "index out of bound. 
zeroGradParameters: %d not between 1 and %d", + offset + 1, + inDim); + } + } + } +} + +#undef ROW_PTR2 +#undef COL_PTR2 + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveAveragePooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveAveragePooling.c new file mode 100644 index 000000000..3675b42d7 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveAveragePooling.c @@ -0,0 +1,258 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialAdaptiveAveragePooling.c" +#else + +#define START_IND(a,b,c) (int)floor((float)(a * c) / b) +#define END_IND(a,b,c) (int)ceil((float)((a + 1) * c) / b) +// #define START_IND(a,b,c) a * c / b +// #define END_IND(a,b,c) (a + 1) * c / b + ((a + 1) * c % b > 0)?1:0 + +static void THNN_(SpatialAdaptiveAveragePooling_updateOutput_frame)( + real *input_p, + real *output_p, + long nslices, + long iwidth, + long iheight, + long owidth, + long oheight, + long stridew, + long strideh, + long strided) +{ + long k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + /* loop over output */ + long i, j; + for(i = 0; i < oheight; i++) + { + int y_start = START_IND(i, oheight, iheight); + int y_end = END_IND(i, oheight, iheight); + int kH = y_end-y_start; + + for(j = 0; j < owidth; j++) + { + + int x_start = START_IND(j, owidth, iwidth); + int x_end = END_IND(j, owidth, iwidth); + int kW = x_end-x_start; + + /* local pointers */ + real *ip = input_p + k*strided + y_start*strideh + x_start*stridew; + real *op = output_p + k*owidth*oheight + i*owidth + j; + + /* compute local average: */ + real sum = 0; + int x,y; + for(y = 0; y < kH; y++) + { + for(x = 0; x < kW; x++) + { + real val = *(ip + y*strideh + x*stridew); + sum += val; + } + } + + /* set output to local average */ + *op = sum / kW / kH; + } + } + } +} + +void THNN_(SpatialAdaptiveAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int owidth, + int oheight) +{ + int dimw = 2; + int dimh = 1; + long nbatch = 1; + long nslices; + long iheight; + long iwidth; + + long istride_d; + long istride_h; + long istride_w; + long istride_b; + + real *input_data; + real *output_data; + + + THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input, + "3D or 4D (batch mode) tensor expected for input, but got: %s"); + + if (input->nDimension == 4) + { + istride_b = input->stride[0]; + nbatch = input->size[0]; + dimw++; + dimh++; + } + + /* sizes */ + nslices = input->size[dimh-1]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + /* strides */ + istride_d = input->stride[dimh-1]; + istride_h = input->stride[dimh]; + istride_w = input->stride[dimw]; + + /* resize output */ + if (input->nDimension == 3) + { + THTensor_(resize3d)(output, nslices, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + + THNN_(SpatialAdaptiveAveragePooling_updateOutput_frame)(input_data, output_data, + nslices, + iwidth, iheight, + owidth, oheight, + istride_w,istride_h, + istride_d); + } + else + { + long p; + + THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(SpatialAdaptiveAveragePooling_updateOutput_frame)(input_data+p*istride_b, output_data+p*nslices*owidth*oheight, + nslices, + iwidth, iheight, + owidth, oheight, + istride_w,istride_h, + istride_d); + } + } +} + +static 
void THNN_(SpatialAdaptiveAveragePooling_updateGradInput_frame)( + real *gradInput_p, + real *gradOutput_p, + long nslices, + long iwidth, + long iheight, + long owidth, + long oheight) +{ + long k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + real *gradInput_p_k = gradInput_p + k*iwidth*iheight; + real *gradOutput_p_k = gradOutput_p + k*owidth*oheight; + + /* calculate average */ + long i, j; + for(i = 0; i < oheight; i++) + { + int y_start = START_IND(i, oheight, iheight); + int y_end = END_IND(i, oheight, iheight); + int kH = y_end-y_start; + + for(j = 0; j < owidth; j++) + { + + int x_start = START_IND(j, owidth, iwidth); + int x_end = END_IND(j, owidth, iwidth); + int kW = x_end-x_start; + + int x,y; + for(y = y_start; y < y_end; y++) + { + for(x = x_start; x < x_end; x++) + { + /* update gradient */ + gradInput_p_k[y*iwidth + x] += gradOutput_p_k[i*owidth + j] / kW / kH; + } + } + } + } + } +} + +void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput) +{ + int dimw = 2; + int dimh = 1; + long nbatch = 1; + int nslices; + int iheight; + int iwidth; + int oheight; + int owidth; + real *gradInput_data; + real *gradOutput_data; + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->nDimension == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + /* sizes */ + nslices = input->size[dimh-1]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + oheight = gradOutput->size[dimh]; + owidth = gradOutput->size[dimw]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + + /* backprop */ + if (input->nDimension == 3) + { + THNN_(SpatialAdaptiveAveragePooling_updateGradInput_frame)(gradInput_data, gradOutput_data, + nslices, + iwidth, iheight, + owidth, oheight); + } + else + { + long p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(SpatialAdaptiveAveragePooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight, + nslices, + iwidth, iheight, + owidth, oheight); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif + +#undef START_IND +#undef END_IND
\ No newline at end of file diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveMaxPooling.c new file mode 100644 index 000000000..fff716e67 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialAdaptiveMaxPooling.c @@ -0,0 +1,274 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialAdaptiveMaxPooling.c" +#else + +static void THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)( + real *input_p, + real *output_p, + THIndex_t *indx_p, + THIndex_t *indy_p, + long nslices, + long iwidth, + long iheight, + long owidth, + long oheight, + long stridew, + long strideh, + long strided) +{ + long k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + /* loop over output */ + long i, j; + for(i = 0; i < oheight; i++) + { + int y_start = (int)floor((float)i / oheight * iheight); + int y_end = (int)ceil((float)(i + 1) / oheight * iheight); + int kH = y_end-y_start; + + for(j = 0; j < owidth; j++) + { + + int x_start = (int)floor((float)j / owidth * iwidth); + int x_end = (int)ceil((float)(j + 1) / owidth * iwidth); + int kW = x_end-x_start; + + /* local pointers */ + real *ip = input_p + k*strided + y_start*strideh + x_start*stridew; + real *op = output_p + k*owidth*oheight + i*owidth + j; + THIndex_t *indyp = indy_p + k*owidth*oheight + i*owidth + j; + THIndex_t *indxp = indx_p + k*owidth*oheight + i*owidth + j; + + /* compute local max: */ + long maxindex = -1; + real maxval = -FLT_MAX; + long tcntr = 0; + int x,y; + for(y = 0; y < kH; y++) + { + for(x = 0; x < kW; x++) + { + real val = *(ip + y*strideh + x*stridew); + if (val > maxval) + { + maxval = val; + maxindex = tcntr; + } + tcntr++; + } + } + + /* set output to local max */ + *op = maxval; + + /* store location of max (x,y) */ + *indyp = (maxindex / kW) + TH_INDEX_BASE; + *indxp = (maxindex % kW) + TH_INDEX_BASE; + } + } + } +} + +void THNN_(SpatialAdaptiveMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int owidth, + int oheight) +{ + int dimw = 2; + int dimh = 1; + long nbatch = 1; + long nslices; + long iheight; + long iwidth; + + long istride_d; + long istride_h; + long istride_w; + long istride_b; + + real *input_data; + real *output_data; + THIndex_t *indices_data; + + + THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input, + "3D or 4D (batch mode) tensor expected for input, but got: %s"); + + if (input->nDimension == 4) + { + istride_b = input->stride[0]; + nbatch = input->size[0]; + dimw++; + dimh++; + } + + /* sizes */ + nslices = input->size[dimh-1]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + /* strides */ + istride_d = input->stride[dimh-1]; + istride_h = input->stride[dimh]; + istride_w = input->stride[dimw]; + + /* resize output */ + if (input->nDimension == 3) + { + THTensor_(resize3d)(output, nslices, oheight, owidth); + /* indices will contain i,j locations for each output point */ + THIndexTensor_(resize4d)(indices, 2, nslices, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data, output_data, + indices_data+nslices*owidth*oheight, indices_data, + nslices, + iwidth, iheight, + owidth, oheight, + istride_w,istride_h, + istride_d); + } + else + { + long p; + + THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth); + /* indices 
will contain i,j locations for each output point */ + THIndexTensor_(resize5d)(indices, 2, nbatch, nslices, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(SpatialAdaptiveMaxPooling_updateOutput_frame)(input_data+p*istride_b, output_data+p*nslices*owidth*oheight, + indices_data+(p+nbatch)*nslices*owidth*oheight, indices_data+p*nslices*owidth*oheight, + nslices, + iwidth, iheight, + owidth, oheight, + istride_w,istride_h, + istride_d); + } + } +} + +static void THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)( + real *gradInput_p, + real *gradOutput_p, + THIndex_t *indx_p, + THIndex_t *indy_p, + long nslices, + long iwidth, + long iheight, + long owidth, + long oheight) +{ + long k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + real *gradInput_p_k = gradInput_p + k*iwidth*iheight; + real *gradOutput_p_k = gradOutput_p + k*owidth*oheight; + THIndex_t *indx_p_k = indx_p + k*owidth*oheight; + THIndex_t *indy_p_k = indy_p + k*owidth*oheight; + + /* calculate max points */ + long i, j; + for(i = 0; i < oheight; i++) + { + int y_start = (int)floor((float) i / oheight * iheight); + for(j = 0; j < owidth; j++) + { + int x_start = (int)floor((float) j / owidth * iwidth); + /* retrieve position of max */ + long maxi = indy_p_k[i*owidth + j] - TH_INDEX_BASE + y_start; + long maxj = indx_p_k[i*owidth + j] - TH_INDEX_BASE + x_start; + + /* update gradient */ + gradInput_p_k[maxi*iwidth + maxj] += gradOutput_p_k[i*owidth + j]; + } + } + } +} + +void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices) +{ + int dimw = 2; + int dimh = 1; + long nbatch = 1; + int nslices; + int iheight; + int iwidth; + int oheight; + int owidth; + real *gradInput_data; + real *gradOutput_data; + THIndex_t *indices_data; + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->nDimension == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + /* sizes */ + nslices = input->size[dimh-1]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + oheight = gradOutput->size[dimh]; + owidth = gradOutput->size[dimw]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = THIndexTensor_(data)(indices); + + /* backprop */ + if (input->nDimension == 3) + { + THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data, gradOutput_data, + indices_data+nslices*owidth*oheight, indices_data, + nslices, + iwidth, iheight, + owidth, oheight); + } + else + { + long p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(SpatialAdaptiveMaxPooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight, + indices_data+(p+nbatch)*nslices*owidth*oheight, indices_data+p*nslices*owidth*oheight, + nslices, + iwidth, iheight, + owidth, oheight); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialAveragePooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialAveragePooling.c new file mode 100644 index 000000000..c063502e7 --- /dev/null +++ 
b/contrib/lua-torch/nn/lib/THNN/generic/SpatialAveragePooling.c @@ -0,0 +1,329 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialAveragePooling.c" +#else + +static inline void THNN_(SpatialAveragePooling_shapeCheck)( + THTensor *input, THTensor *gradOutput, + int kH, int kW, int dH, int dW, int padH, int padW, + bool ceil_mode) { + + THArgCheck(kW > 0 && kH > 0, 5, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 8, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + + int ndim = input->nDimension; + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input, + "3D or 4D input tensor expected but got: %s"); + + THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, + "pad should be smaller than half of kernel size, but got " + "padW = %d, padH = %d, kW = %d, kH = %d", + padW, padH, kW, kH); + + long nInputPlane = input->size[dimh-1]; + long inputHeight = input->size[dimh]; + long inputWidth = input->size[dimw]; + long outputHeight, outputWidth; + long nOutputPlane = nInputPlane; + + if(ceil_mode) + { + outputHeight = (long)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1; + outputWidth = (long)(ceil((float)(inputWidth - kW + 2*padW) / dW)) + 1; + } + else + { + outputHeight = (long)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1; + outputWidth = (long)(floor((float)(inputWidth - kW + 2*padW) / dW)) + 1; + } + + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + if (outputWidth < 1 || outputHeight < 1) + THError("Given input size: (%dx%dx%d). " + "Calculated output size: (%dx%dx%d). 
Output size is too small", + nInputPlane,inputHeight,inputWidth,nInputPlane,outputHeight,outputWidth); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(SpatialAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + bool ceil_mode, + bool count_include_pad) +{ + real *output_data; + real *input_data; + + int dimw = 2; + int dimh = 1; + int dimc = 0; + long nbatch = 1; + + long inputWidth; + long inputHeight; + long outputWidth; + long outputHeight; + long nInputPlane; // number of channels (or colors) + + long k; + + THNN_(SpatialAveragePooling_shapeCheck) + (input, NULL, kH, kW, dH, dW, padH, padW, ceil_mode); + + if (input->nDimension == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + dimc++; + } + + inputWidth = input->size[dimw]; + inputHeight = input->size[dimh]; + nInputPlane = input->size[dimc]; + + if(ceil_mode) + { + outputWidth = (long)(ceil((float)(inputWidth - kW + 2*padW) / dW)) + 1; + outputHeight = (long)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1; + } + else + { + outputWidth = (long)(floor((float)(inputWidth - kW + 2*padW) / dW)) + 1; + outputHeight = (long)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1; + } + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + if (input->nDimension == 3) + THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth); + else + THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth); + + input = THTensor_(newContiguous)(input); + THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous"); + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane; k++) + { + long p; + for(p = 0; p < nbatch; p++) + { + long xx, yy; + /* For all output pixels... */ + real *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight; + real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight; + long i; + for(i = 0; i < outputWidth*outputHeight; i++) + ptr_output[i] = 0; + + for(yy = 0; yy < outputHeight; yy++) + { + for(xx = 0; xx < outputWidth; xx++) + { + /* Compute the mean of the input image... 
*/ + long hstart = yy * dH - padH; + long wstart = xx * dW - padW; + long hend = fminf(hstart + kH, inputHeight + padH); + long wend = fminf(wstart + kW, inputWidth + padW); + int pool_size = (hend - hstart) * (wend - wstart); + hstart = fmaxf(hstart, 0); + wstart = fmaxf(wstart, 0); + hend = fminf(hend, inputHeight); + wend = fminf(wend, inputWidth); + + real sum = 0; + + int divide_factor; + if(count_include_pad) + divide_factor = pool_size; + else + divide_factor = (hend - hstart) * (wend - wstart); + + long kx, ky; + + for(ky = hstart; ky < hend; ky++) + { + for(kx = wstart; kx < wend; kx++) + sum += ptr_input[ky*inputWidth + kx]; + } + /* Update output */ + *ptr_output++ += sum/divide_factor; + } + } + } + } + THTensor_(free)(input); +} + +void THNN_(SpatialAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + bool ceil_mode, + bool count_include_pad) +{ + int dimw = 2; + int dimh = 1; + int dimc = 0; + long nbatch = 1; + long ndim = 3; + + long inputWidth; + long inputHeight; + long outputWidth; + long outputHeight; + long nInputPlane; // number of channels (or colors) + + real *gradOutput_data; + real *input_data, *gradInput_data; + + long k; + + THNN_(SpatialAveragePooling_shapeCheck) + (input, gradOutput, kH, kW, dH, dW, padH, padW, ceil_mode); + + + if (input->nDimension == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + dimc++; + ndim = 4; + } + + inputWidth = input->size[dimw]; + inputHeight = input->size[dimh]; + nInputPlane = input->size[dimc]; + + if(ceil_mode) + { + outputWidth = (long)(ceil((float)(inputWidth - kW + 2*padW) / dW)) + 1; + outputHeight = (long)(ceil((float)(inputHeight - kH + 2*padH) / dH)) + 1; + } + else + { + outputWidth = (long)(floor((float)(inputWidth - kW + 2*padW) / dW)) + 1; + outputHeight = (long)(floor((float)(inputHeight - kH + 2*padH) / dH)) + 1; + } + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + + THTensor_(resizeAs)(gradInput, input); + + gradOutput = THTensor_(newContiguous)(gradOutput); + THArgCheck(THTensor_(isContiguous)(gradInput), 4, "gradInput must be contiguous"); + + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane; k++) + { + long p; + for(p = 0; p < nbatch; p++) + { + real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight; + long xx, yy; + + real* ptr_gi = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight; + real *ptr_gradInput = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight; + + long i; + for(i=0; i<inputWidth*inputHeight; i++) + ptr_gi[i] = 0.0; + + for(yy = 0; yy < outputHeight; yy++) + { + for(xx = 0; xx < outputWidth; xx++) + { + long hstart = yy * dH - padH; + long wstart = xx * dW - padW; + long hend = fminf(hstart + kH, inputHeight + padH); + long wend = fminf(wstart + kW, inputWidth + padW); + int pool_size = (hend - hstart) * (wend - wstart); + hstart = fmaxf(hstart, 0); + wstart = fmaxf(wstart, 0); + hend = fminf(hend, 
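/* The backward pass mirrors the forward window arithmetic: each output
   gradient z is divided by the same divide_factor and scattered uniformly
   over the clipped window. Worked corner case with kH = kW = 3,
   padH = padW = 1, dH = dW = 1 at output (0,0): hstart = wstart = -1 and
   hend = wend = 2, so pool_size = 9 while only 2x2 = 4 real input cells
   exist; count_include_pad picks 9 (padding counted as zeros) over 4. */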
inputHeight); + wend = fminf(wend, inputWidth); + + real z = *ptr_gradOutput++; + + int divide_factor; + if(count_include_pad) + divide_factor = pool_size; + else + divide_factor = (hend - hstart) * (wend - wstart); + + long kx, ky; + for(ky = hstart ; ky < hend; ky++) + { + for(kx = wstart; kx < wend; kx++) + ptr_gradInput[ky*inputWidth + kx] += z/divide_factor; + } + } + } + } + } + + THTensor_(free)(gradOutput); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialClassNLLCriterion.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialClassNLLCriterion.c new file mode 100644 index 000000000..d711c8590 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialClassNLLCriterion.c @@ -0,0 +1,131 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialClassNLLCriterion.c" +#else + +#define INITIAL_CHECK \ + THArgCheck(THIndexTensor_(nDimension)(target) == 3, 3, \ + "only batches of spatial targets supported (3D tensors)" \ + " but got targets of dimension: %d", \ + THIndexTensor_(nDimension)(target)); \ + THArgCheck(THTensor_(nDimension)(input) == 4, 2, \ + "only batches of spatial inputs supported (4D tensors), " \ + "but got input of dimension: %d", THTensor_(nDimension)(input)); \ + if (weights && THTensor_(nElement)(weights) != THTensor_(size)(input, 1)) { \ + THError("weight tensor should be defined either for all or no classes"); \ + } \ + \ + { \ + long input0 = THTensor_(size)(input, 0); \ + long input1 = THTensor_(size)(input, 1); \ + long input2 = THTensor_(size)(input, 2); \ + long input3 = THTensor_(size)(input, 3); \ + long target0 = THIndexTensor_(size)(target, 0); \ + long target1 = THIndexTensor_(size)(target, 1); \ + long target2 = THIndexTensor_(size)(target, 2); \ + THAssertMsg(input0 == target0 && input2 == target1 && input3 == target2, \ + "size mismatch (got input: %ldx%ldx%ldx%ld, target: %ldx%ldx%ld)", \ + input0, input1, input2, input3, target0, target1, target2); \ + } + +void THNN_(SpatialClassNLLCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *output, + bool sizeAverage, + THTensor *weights, + THTensor *total_weight) +{ + INITIAL_CHECK; + + input = THTensor_(newContiguous)(input); + target = THIndexTensor_(newContiguous)(target); + weights = weights ? THTensor_(newContiguous)(weights) : NULL; + + real *input_data = THTensor_(data)(input); + THIndex_t *target_data = THIndexTensor_(data)(target); + real *weights_data = weights ? THTensor_(data)(weights) : NULL; + real *output_data = THTensor_(data)(output); + real *total_weight_data = THTensor_(data)(total_weight); + + long batch_size = THTensor_(size)(input, 0); + long n_classes = THTensor_(size)(input, 1); + long map_size = THTensor_(size)(input, 2) * THTensor_(size)(input, 3); + long sample_size = map_size * n_classes; + + real total_weight_acc = 0; + real output_acc = 0; + for (int b = 0; b < batch_size; b++) { + for (int elem = 0; elem < map_size; elem++) { + int cur_target = target_data[b * map_size + elem] - TH_INDEX_BASE; + THAssert(cur_target >= 0 && cur_target < n_classes); + + real cur_weight = weights ? 
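/* Per-pixel NLL accumulation: each spatial position adds
   -w[t] * input[b][t][h][w] to the loss, where t is that position's target
   class and input is assumed to hold log-probabilities (e.g. the output of
   a LogSoftMax layer); with sizeAverage the sum is then divided by the
   accumulated class weights, so one pixel with log-probability -0.2 and
   weight 1 contributes 0.2 to the averaged loss. */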
weights_data[cur_target] : 1.0f; + total_weight_acc += cur_weight; + output_acc -= input_data[b * sample_size + cur_target * map_size + elem] * cur_weight; + } + } + *total_weight_data = total_weight_acc; + *output_data = output_acc; + + if (sizeAverage && *total_weight_data) + *output_data /= *total_weight_data; + + THTensor_(free)(input); + THIndexTensor_(free)(target); + if (weights) + THTensor_(free)(weights); +} + +void THNN_(SpatialClassNLLCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *gradInput, + bool sizeAverage, + THTensor *weights, + THTensor *total_weight) +{ + INITIAL_CHECK; + THArgCheck(THTensor_(isContiguous)(gradInput), 4, + "gradInput must be contiguous"); + + real *total_weight_data = THTensor_(data)(total_weight); + if (*total_weight_data <= 0) + return; + + target = THIndexTensor_(newContiguous)(target); + weights = weights ? THTensor_(newContiguous)(weights) : NULL; + + THIndex_t *target_data = THIndexTensor_(data)(target); + real *weights_data = weights ? THTensor_(data)(weights) : NULL; + real *gradInput_data = THTensor_(data)(gradInput); + + long batch_size = THTensor_(size)(input, 0); + long n_classes = THTensor_(size)(input, 1); + long map_size = THTensor_(size)(input, 2) * THTensor_(size)(input, 3); + long sample_size = map_size * n_classes; + + real normalize = sizeAverage ? *total_weight_data : 1.0f; + + int b; + #pragma omp parallel for + for (b = 0; b < batch_size; b++) { + int elem; + for (elem = 0; elem < map_size; elem++) { + int cur_target = target_data[b * map_size + elem] - TH_INDEX_BASE; + THAssert(cur_target >= 0 && cur_target < n_classes); + + gradInput_data[b * sample_size + cur_target * map_size + elem] = + -(weights ? weights_data[cur_target] : 1.0f) / normalize; + } + } + + THIndexTensor_(free)(target); + if (weights) + THTensor_(free)(weights); +} + +#undef INITIAL_CHECK + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionLocal.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionLocal.c new file mode 100644 index 000000000..6db5a5db9 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionLocal.c @@ -0,0 +1,367 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialConvolutionLocal.c" +#else + +static inline void THNN_(SpatialConvolutionLocal_shapeCheck)( + THTensor *input, THTensor *gradOutput, + THTensor *weight, THTensor *bias, + int kH, int kW, int dH, + int dW, int padH, int padW, + long inputHeight, long inputWidth, + long outputHeight, long outputWidth) { + + THArgCheck(kW > 0 && kH > 0, 9, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + + int ndim = input->nDimension; + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input, + "3D or 4D input tensor expected but got: %s"); + + long nInputPlane = weight->size[2] / (kH * kW); + long nOutputPlane = weight->size[1]; + + if (bias != NULL) { + THNN_CHECK_DIM_SIZE(bias, 3, 0, nOutputPlane); + THNN_CHECK_DIM_SIZE(bias, 3, 1, outputHeight); + THNN_CHECK_DIM_SIZE(bias, 3, 2, outputWidth); + } + + THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, 
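/* SpatialConvolutionLocal is a locally-connected layer: its 3D weight view
   is (oH*oW) x nOutputPlane x (nInputPlane*kH*kW), i.e. every output
   location owns a private filter bank instead of sharing one. Parameters
   grow accordingly: nInputPlane = 3, nOutputPlane = 16, kH = kW = 5 with
   an 8x8 output needs 8*8*16*3*5*5 = 76800 weights, versus 1200 for a
   shared-weight convolution with the same geometry. */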
outputWidth); + } +} + +static THTensor* THNN_(view_weight_local)(THTensor *_weight) +{ + THTensor *weight = THTensor_(newContiguous)(_weight); + THArgCheck(weight->nDimension == 3 || weight->nDimension == 6, 4, + "weight tensor should be 3D or 6D - got %dD", weight->nDimension); + if (weight->nDimension == 6) { + long s1 = weight->size[0] * weight->size[1]; + long s2 = weight->size[2]; + long s3 = weight->size[3] * weight->size[4] * weight->size[5]; + THTensor *old_weight = weight; + weight = THTensor_(newWithStorage3d)(weight->storage, + weight->storageOffset, + s1, -1, s2, -1, s3, -1); + THTensor_(free)(old_weight); + } + return weight; +} + +static void THNN_(SpatialConvolutionLocal_updateOutput_frame) + ( + THTensor *input, THTensor *output, + THTensor *weight, THTensor *bias, THTensor *finput, + int kW, int kH, int dW, int dH, int padW, int padH, + long nInputPlane, long inputWidth, long inputHeight, + long nOutputPlane, long outputWidth, long outputHeight) +{ + long i; + THTensor *output3d, *finput3d; + + THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + outputWidth, outputHeight); + + THTensor_(copy)(output, bias); + + output3d = THTensor_(newWithStorage3d) + (output->storage, output->storageOffset, + outputHeight * outputWidth, 1, + nOutputPlane, outputHeight * outputWidth, + 1, nOutputPlane * outputHeight * outputWidth); + + finput3d = THTensor_(newWithStorage3d) + (finput->storage, finput->storageOffset, + outputHeight * outputWidth, 1, + kW * kH * nInputPlane, outputHeight * outputWidth, + 1, kW * kH * nInputPlane * outputHeight * outputWidth); + + // weight: oH*oW x nOutputPlane x nInputPlane*kH*kW + // finput3d: oH*oW x nInputPlane*kH*kW x 1 + THTensor_(baddbmm)(output3d, 1.0, output3d, 1.0, weight, finput3d); + // output3d: oH*oW x nOutputPlane x 1 + + THTensor_(free)(output3d); + THTensor_(free)(finput3d); +} + +void THNN_(SpatialConvolutionLocal_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + long inputWidth, long inputHeight, + long outputWidth, long outputHeight) +{ + weight = THNN_(view_weight_local)(weight); + + THNN_(SpatialConvolutionLocal_shapeCheck) + (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, + inputHeight, inputWidth, outputHeight, outputWidth); + + input = THTensor_(newContiguous)(input); + + long nInputPlane = THTensor_(size)(weight, 2)/ (kW * kH); + long nOutputPlane = THTensor_(size)(weight, 1); + + if(input->nDimension == 3) + { + THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth); + THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); + + THNN_(SpatialConvolutionLocal_updateOutput_frame) + (input, output, weight, bias, finput, + kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + } + else + { + long T = input->size[0]; + long t; + + THTensor_(resize3d)(finput, T, kW*kH*nInputPlane, outputHeight*outputWidth); + THTensor_(resize4d)(output, T, nOutputPlane, outputHeight, outputWidth); + +#pragma omp parallel for private(t) + for(t = 0; t < T; t++) + { + THTensor *input_t = THTensor_(newSelect)(input, 0, t); + THTensor *output_t = THTensor_(newSelect)(output, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + THNN_(SpatialConvolutionLocal_updateOutput_frame) + (input_t, output_t, weight, bias, finput_t, + kW, 
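/* Forward formulation used by updateOutput_frame above: after
   unfolded_copy (im2col), baddbmm runs one small GEMM per output location
   p, computing output3d[p] (nOutputPlane x 1) = bias[p] + weight[p]
   (nOutputPlane x nInputPlane*kH*kW) times finput3d[p]
   (nInputPlane*kH*kW x 1); the stride tricks on output3d and finput3d
   make the batch dimension of baddbmm iterate over the oH*oW locations. */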
kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + + THTensor_(free)(input_t); + THTensor_(free)(output_t); + THTensor_(free)(finput_t); + } + } + + THTensor_(free)(input); + THTensor_(free)(weight); +} + + +static void THNN_(SpatialConvolutionLocal_updateGradInput_frame) + (THTensor *gradInput, THTensor *gradOutput, + THTensor *weight, THTensor *fgradInput, + int kW, int kH, int dW, int dH, int padW, int padH, + long nInputPlane, long inputWidth, long inputHeight, + long nOutputPlane, long outputWidth, long outputHeight) +{ + THTensor *gradOutput3d, *fgradInput3d; + gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset, + outputHeight*outputWidth, 1, + nOutputPlane, outputHeight*outputWidth, + 1, nOutputPlane*outputHeight*outputWidth); + fgradInput3d = THTensor_(newWithStorage3d)(fgradInput->storage, fgradInput->storageOffset, + outputHeight*outputWidth, 1, + kW*kH*nInputPlane, outputHeight*outputWidth, + 1, kW*kH*nInputPlane*outputHeight*outputWidth); + // weight: oH*oW x nInputPlane*kH*kW x nOutputPlane + // gradOutput3d: oH*oW x nOutputPlane x 1 + THTensor_(baddbmm)(fgradInput3d, 0.0, fgradInput3d, 1.0, weight, gradOutput3d); + // fgradInput3d: oH*oW x nInputPlane*kH*kW x 1 + + THTensor_(free)(gradOutput3d); + THTensor_(free)(fgradInput3d); + + THTensor_(zero)(gradInput); + + THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + outputWidth, outputHeight); + +} + +void THNN_(SpatialConvolutionLocal_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + long inputWidth, long inputHeight, + long outputWidth, long outputHeight) +{ + weight = THNN_(view_weight_local)(weight); + + THNN_(SpatialConvolutionLocal_shapeCheck) + (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, + inputHeight, inputWidth, outputHeight, outputWidth); + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + long nInputPlane = THTensor_(size)(weight,2)/(kW*kH); + long nOutputPlane = THTensor_(size)(weight,1); + + THTensor_(resizeAs)(gradInput, input); + THTensor_(resizeAs)(fgradInput, finput); + + THTensor *tweight = THTensor_(new)(); + THTensor_(transpose)(tweight, weight, 1, 2); + + if(input->nDimension == 3) + { + THNN_(SpatialConvolutionLocal_updateGradInput_frame) + (gradInput, gradOutput, tweight, + fgradInput, kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + } + else + { + long T = input->size[0]; + long t; + +#pragma omp parallel for private(t) + for(t = 0; t < T; t++) + { + THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t); + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t); + + THNN_(SpatialConvolutionLocal_updateGradInput_frame) + (gradInput_t, gradOutput_t, tweight, fgradInput_t, + kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + + THTensor_(free)(gradInput_t); + THTensor_(free)(gradOutput_t); + THTensor_(free)(fgradInput_t); + } + } + + THTensor_(free)(tweight); + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); +} + +static void 
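/* The parameter gradient for the locally-connected layer (next function)
   is a per-location outer product, again batched through baddbmm:
   gradWeight[p][o][j] += scale * gradOutput[o][p] * finput[j][p] for every
   output location p, output plane o and unfolded input column j. */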
THNN_(SpatialConvolutionLocal_accGradParameters_frame) + (THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, + THTensor *finput, real scale, + int kW, int kH, int dW, int dH, int padW, int padH, + long nInputPlane, long inputWidth, long inputHeight, + long nOutputPlane, long outputWidth, long outputHeight) +{ + + THTensor *gradOutput3d, *finput3d; + gradOutput3d = THTensor_(newWithStorage3d)(gradOutput->storage, gradOutput->storageOffset, + outputHeight*outputWidth, 1, + nOutputPlane, outputHeight*outputWidth, + 1, nOutputPlane*outputHeight*outputWidth); + finput3d = THTensor_(newWithStorage3d)(finput->storage, finput->storageOffset, + outputHeight*outputWidth, 1, + 1, kW*kH*nInputPlane*outputHeight*outputWidth, + kW*kH*nInputPlane, outputHeight*outputWidth); + // gradOutput3d: oH*oW x nOutputPlane x 1 + // finput3d: oH*oW x 1 x kW*kH*nInputPlane + THTensor_(baddbmm)(gradWeight, 1.0, gradWeight, scale, gradOutput3d, finput3d); + // gradWeight: oH*oW x nOutputPlane x kW*kH*nInputPlane + + THTensor_(cadd)(gradBias, gradBias, scale, gradOutput); + + THTensor_(free)(gradOutput3d); + THTensor_(free)(finput3d); +} + +void THNN_(SpatialConvolutionLocal_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + long inputWidth, long inputHeight, + long outputWidth, long outputHeight, + accreal scale_) +{ + THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous"); + THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous"); + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + gradWeight = THNN_(view_weight_local)(gradWeight); + + THNN_(SpatialConvolutionLocal_shapeCheck) + (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, + inputHeight, inputWidth, outputHeight, outputWidth); + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + + long nInputPlane = THTensor_(size)(gradWeight,2)/(kW*kH); + long nOutputPlane = THTensor_(size)(gradWeight,1); + + if(input->nDimension == 3) + { + THNN_(SpatialConvolutionLocal_accGradParameters_frame) + (gradOutput, gradWeight, gradBias, finput, scale, + kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + } + else + { + long T = input->size[0]; + long t; + + for(t = 0; t < T; t++) + { + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + THNN_(SpatialConvolutionLocal_accGradParameters_frame) + (gradOutput_t, gradWeight, gradBias, finput_t, scale, + kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + + THTensor_(free)(gradOutput_t); + THTensor_(free)(finput_t); + } + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(gradWeight); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMM.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMM.c new file mode 100644 index 000000000..28fea517c --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMM.c @@ -0,0 +1,377 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialConvolutionMM.c" +#else + +static inline void THNN_(SpatialConvolutionMM_shapeCheck)( + THTensor *input, THTensor *gradOutput, + THTensor *weight, THTensor 
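/* Output geometry used throughout this file:
   outputHeight = (inputHeight + 2*padH - kH) / dH + 1 (integer division),
   and likewise for the width; e.g. a 224x224 input with kH = kW = 7,
   padH = padW = 3, dH = dW = 2 yields (224 + 6 - 7) / 2 + 1 = 112. */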
*bias, + int kH, int kW, int dH, int dW, int padH, int padW) { + + THArgCheck(kW > 0 && kH > 0, 9, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + THNN_ARGCHECK(weight->nDimension == 2 || weight->nDimension == 4, 5, weight, + "2D or 4D weight tensor expected, but got: %s"); + + if (bias != NULL) { + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]); + } + + int ndim = input->nDimension; + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input, + "3D or 4D input tensor expected but got: %s"); + + long nInputPlane = weight->size[1] / (kH * kW); + long inputHeight = input->size[dimh]; + long inputWidth = input->size[dimw]; + long nOutputPlane = weight->size[0]; + long outputHeight = (inputHeight + 2*padH - kH) / dH + 1; + long outputWidth = (inputWidth + 2*padW - kW) / dW + 1; + + if (outputWidth < 1 || outputHeight < 1) + THError("Given input size: (%d x %d x %d). " + "Calculated output size: (%d x %d x %d). Output size is too small", + nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth); + + THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } +} + +static THTensor* THNN_(view_weight_MM2d)(THTensor *weight) { + weight = THTensor_(newContiguous)(weight); + if (weight->nDimension == 4) { + long s1 = weight->size[0]; + long s2 = weight->size[1] * weight->size[2] * weight->size[3]; + THTensor *old_weight = weight; + weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset, + s1, -1, s2, -1); + THTensor_(free)(old_weight); + } + return weight; +} + +static void THNN_(SpatialConvolutionMM_updateOutput_frame)( + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + long nInputPlane, + long inputWidth, + long inputHeight, + long nOutputPlane, + long outputWidth, + long outputHeight) +{ + long i; + THTensor *output2d; + + THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + outputWidth, outputHeight); + + output2d = THTensor_(newWithStorage2d)(output->storage, output->storageOffset, + nOutputPlane, -1, + outputHeight*outputWidth, -1); + if (bias) { + for(i = 0; i < nOutputPlane; i++) + THVector_(fill) + (output->storage->data + output->storageOffset + output->stride[0] * i, + THTensor_(get1d)(bias, i), outputHeight*outputWidth); + } else { + THTensor_(zero)(output); + } + + THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput); + + THTensor_(free)(output2d); +} + +void THNN_(SpatialConvolutionMM_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH) +{ + weight = THNN_(view_weight_MM2d)(weight); + + THNN_(SpatialConvolutionMM_shapeCheck) + (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW); + + input = THTensor_(newContiguous)(input); + int ndim = input->nDimension; + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + long nInputPlane = 
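/* This is the classic im2col + GEMM convolution: unfolded_copy lays the
   input patches out as finput of shape (kW*kH*nInputPlane) x (oH*oW), the
   2D weight view is nOutputPlane x (kW*kH*nInputPlane), and one addmm
   then produces output2d = bias + weight times finput, of shape
   nOutputPlane x (oH*oW). */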
input->size[dimf]; + long inputHeight = input->size[dimh]; + long inputWidth = input->size[dimw]; + long nOutputPlane = weight->size[0]; + long outputHeight = (inputHeight + 2*padH - kH) / dH + 1; + long outputWidth = (inputWidth + 2*padW - kW) / dW + 1; + + if(input->nDimension == 3) + { + THTensor_(resize2d)(finput, kW*kH*nInputPlane, outputHeight*outputWidth); + THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth); + + THNN_(SpatialConvolutionMM_updateOutput_frame) + (input, output, weight, bias, finput, + kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + } + else + { + long T = input->size[0]; + long t; + + THTensor_(resize3d)(finput, T, kW*kH*nInputPlane, outputHeight*outputWidth); + THTensor_(resize4d)(output, T, nOutputPlane, outputHeight, outputWidth); + +#pragma omp parallel for private(t) + for(t = 0; t < T; t++) + { + THTensor *input_t = THTensor_(newSelect)(input, 0, t); + THTensor *output_t = THTensor_(newSelect)(output, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + THNN_(SpatialConvolutionMM_updateOutput_frame) + (input_t, output_t, weight, bias, finput_t, + kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + + THTensor_(free)(input_t); + THTensor_(free)(output_t); + THTensor_(free)(finput_t); + } + } + + THTensor_(free)(input); + THTensor_(free)(weight); +} + +static void THNN_(SpatialConvolutionMM_updateGradInput_frame)( + THTensor *gradInput, + THTensor *gradOutput, + THTensor *weight, + THTensor *fgradInput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH) +{ + THTensor *gradOutput2d = THTensor_(newWithStorage2d) + (gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + gradOutput->size[1]*gradOutput->size[2], -1); + THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d); + THTensor_(free)(gradOutput2d); + + THTensor_(zero)(gradInput); + + THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, + padW, padH, + gradInput->size[0], gradInput->size[2], gradInput->size[1], + gradOutput->size[2], gradOutput->size[1]); +} + +void THNN_(SpatialConvolutionMM_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH) +{ + weight = THNN_(view_weight_MM2d)(weight); + + THNN_(SpatialConvolutionMM_shapeCheck) + (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW); + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + + THTensor_(resizeAs)(gradInput, input); + THTensor_(resizeAs)(fgradInput, finput); + + // depending on the BLAS library, fgradInput (result tensor) might + // be left uninitialized on zero alpha, which might lead to weird behavior + // hence, to be safe, zero it + THTensor_(zero)(fgradInput); + THTensor *tweight = THTensor_(new)(); + THTensor_(transpose)(tweight, weight, 0, 1); + + if(input->nDimension == 3) + { + THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput, gradOutput, + tweight, fgradInput, + kW, kH, dW, dH, padW, padH); + } + else + { + long T = input->size[0]; + long t; + +#pragma omp parallel for private(t) + for(t = 0; t < T; t++) + { + THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t); + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *fgradInput_t = 
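/* Backward wrt the input inverts those two steps: addmm with beta = 0
   computes fgradInput = weight^T times gradOutput2d in column space, and
   unfolded_acc (col2im) then scatters the columns back into gradInput,
   summing wherever the kW x kH windows overlapped in the forward pass. */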
THTensor_(newSelect)(fgradInput, 0, t); + + THNN_(SpatialConvolutionMM_updateGradInput_frame)(gradInput_t, gradOutput_t, + tweight, fgradInput_t, + kW, kH, dW, dH, padW, padH); + + THTensor_(free)(gradInput_t); + THTensor_(free)(gradOutput_t); + THTensor_(free)(fgradInput_t); + } + } + + THTensor_(free)(tweight); + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); +} + +static void THNN_(SpatialConvolutionMM_accGradParameters_frame)( + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + real scale) +{ + long i; + THTensor *gradOutput2d = THTensor_(newWithStorage2d) + (gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + gradOutput->size[1]*gradOutput->size[2], -1); + + THTensor *tfinput = THTensor_(new)(); + THTensor_(transpose)(tfinput, finput, 0, 1); + THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, tfinput); + THTensor_(free)(tfinput); + + if (gradBias) { + for(i = 0; i < gradBias->size[0]; i++) + { + long k; + real sum = 0; + real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0]; + for(k = 0; k < gradOutput2d->size[1]; k++) + sum += data[k]; + (gradBias->storage->data + gradBias->storageOffset)[i] += scale*sum; + } + } + + THTensor_(free)(gradOutput2d); +} + +void THNN_(SpatialConvolutionMM_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + accreal scale_) +{ + THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous"); + if (gradBias) + THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous"); + + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + gradWeight = THNN_(view_weight_MM2d)(gradWeight); + + THNN_(SpatialConvolutionMM_shapeCheck) + (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW); + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + + if(input->nDimension == 3) + { + THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, + gradBias, finput, scale); + } + else + { + long T = input->size[0]; + long t; + + for(t = 0; t < T; t++) + { + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + THNN_(SpatialConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight, + gradBias, finput_t, scale); + + THTensor_(free)(gradOutput_t); + THTensor_(free)(finput_t); + } + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(gradWeight); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMap.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMap.c new file mode 100644 index 000000000..142a03551 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialConvolutionMap.c @@ -0,0 +1,277 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialConvolutionMap.c" +#else + +void THNN_(SpatialConvolutionMap_updateOutput)( + THNNState *state, THTensor *input, THTensor *output, THTensor *weight, THTensor *bias, + THTensor *connTable, int nInputPlane, int nOutputPlane, + int dW, int dH) +{ + THArgCheck( + weight != NULL && weight->nDimension == 3 + && connTable != NULL && connTable->size[0] == weight->size[0], 4, + "3D weight tensor expected (connTable:size(%d) x kH 
x kW)", TH_INDEX_BASE + ); + + int dimw = 2; + int dimh = 1; + int dimc = 0; + long nbatch = 1; + + THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D(batch mode) tensor expected"); + + if (input->nDimension == 4) + { + nbatch = input->size[0]; + dimc++; + dimw++; + dimh++; + } + + const long kH = weight->size[1]; + const long kW = weight->size[2]; + + THArgCheck(input->size[dimc] >= nInputPlane, 2, "invalid number of input planes"); + THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH, 2, "input image smaller than kernel size"); + + const long input_w = input->size[dimw]; + const long input_h = input->size[dimh]; + const long output_w = (input_w - kW) / dW + 1; + const long output_h = (input_h - kH) / dH + 1; + + if (input->nDimension == 3) + THTensor_(resize3d)(output, nOutputPlane, output_h, output_w); + else + THTensor_(resize4d)(output, input->size[0], nOutputPlane, output_h, output_w); + + /* contiguous */ + input = THTensor_(newContiguous)(input); + output = THTensor_(newContiguous)(output); + weight = THTensor_(newContiguous)(weight); + bias = bias ? THTensor_(newContiguous)(bias) : bias; + connTable = THTensor_(newContiguous)(connTable); + + /* get raw pointers */ + real *input_data = THTensor_(data)(input); + real *output_data = THTensor_(data)(output); + real *weight_data = THTensor_(data)(weight); + real *bias_data = THTensor_(data)(bias); + real *connTable_data = THTensor_(data)(connTable); + + long p; +#pragma omp parallel for private(p) + for (p = 0; p < nOutputPlane; p++) + { + long m; + for (m = 0; m < nbatch; m++) + { + /* add bias */ + real *ptr_output = output_data + p*output_w*output_h + m*nOutputPlane*output_w*output_h; + long j, k; + real z= bias_data[p]; + for (j = 0; j < output_h*output_w; j++) + ptr_output[j] = z; + + /* convolve all maps */ + int nweight = connTable->size[0]; + for (k = 0; k < nweight; k++) + { + /* get offsets for input/output */ + int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE; + int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE; + + if (o == p) + { + THTensor_(validXCorr2Dptr)( + output_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h, + 1.0, + input_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, input_h, input_w, + weight_data + k*kW*kH, + kH, kW, + dH, dW + ); + } + } + } + } + + /* clean up */ + THTensor_(free)(input); + THTensor_(free)(output); + THTensor_(free)(weight); + if (bias) THTensor_(free)(bias); + THTensor_(free)(connTable); +} + +void THNN_(SpatialConvolutionMap_updateGradInput)( + THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput, THTensor *weight, THTensor *bias, + THTensor *connTable, int nInputPlane, int nOutputPlane, + int dW, int dH) +{ + THArgCheck( + weight != NULL && weight->nDimension == 3 + && connTable != NULL && connTable->size[0] == weight->size[0], 5, + "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE + ); + + /* and dims */ + int dimw = 2; + int dimh = 1; + long nbatch = 1; + if (input->nDimension == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + const long input_h = input->size[dimh]; + const long input_w = input->size[dimw]; + const long output_h = gradOutput->size[dimh]; + const long output_w = gradOutput->size[dimw]; + const long kH = weight->size[1]; + const long kW = weight->size[2]; + + /* contiguous */ + gradInput = THTensor_(newContiguous)(gradInput); + gradOutput = THTensor_(newContiguous)(gradOutput); + weight = THTensor_(newContiguous)(weight); + connTable = 
THTensor_(newContiguous)(connTable); + + /* Resize/Zero */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* get raw pointers */ + real *gradInput_data = THTensor_(data)(gradInput); + real *gradOutput_data = THTensor_(data)(gradOutput); + real *weight_data = THTensor_(data)(weight); + real *connTable_data = THTensor_(data)(connTable); + + long p; +#pragma omp parallel for private(p) + for (p = 0; p < nInputPlane; p++) + { + long m; + for (m = 0; m < nbatch; m++) + { + long k; + /* backward all */ + int nkernel = connTable->size[0]; + for (k = 0; k < nkernel; k++) + { + int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE; + int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE; + if (i == p) + { + /* gradient to input */ + THTensor_(fullConv2Dptr)( + gradInput_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, 1.0, + gradOutput_data + o*output_w*output_h + m*nOutputPlane*output_w*output_h, output_h, output_w, + weight_data + k*kW*kH, kH, kW, dH, dW + ); + } + } + } + } + + /* clean up */ + THTensor_(free)(gradInput); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); + THTensor_(free)(connTable); +} + +void THNN_(SpatialConvolutionMap_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *connTable, + int nInputPlane, + int nOutputPlane, + int dW, int dH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + THArgCheck( + gradWeight != NULL && gradWeight->nDimension == 3 + && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5, + "3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE + ); + + /* and dims */ + int dimw = 2; + int dimh = 1; + long nbatch = 1; + if (input->nDimension == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + const long input_h = input->size[dimh]; + const long input_w = input->size[dimw]; + const long output_h = gradOutput->size[dimh]; + const long output_w = gradOutput->size[dimw]; + const long kH = gradWeight->size[1]; + const long kW = gradWeight->size[2]; + + /* contiguous */ + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous"); + THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous"); + + /* get raw pointers */ + real *input_data = THTensor_(data)(input); + real *gradOutput_data = THTensor_(data)(gradOutput); + real *gradWeight_data = THTensor_(data)(gradWeight); + real *gradBias_data = THTensor_(data)(gradBias); + + + long k; + /* gradients wrt bias */ +#pragma omp parallel for private(k) + for (k = 0; k < nOutputPlane; k++) + { + long m; + for (m = 0; m < nbatch; m++) + { + real *ptr_gradOutput = gradOutput_data + k*output_w*output_h + m*nOutputPlane*output_w*output_h; + long l; + for (l = 0; l < output_h*output_w; l++) + gradBias_data[k] += scale*ptr_gradOutput[l]; + } + } + + /* gradients wrt weight */ + const int nkernel = connTable->size[0]; +#pragma omp parallel for private(k) + for (k = 0; k < nkernel; k++) + { + long m; + for (m = 0; m < nbatch; m++) + { + int o = (int)THTensor_(get2d)(connTable,k,1) - TH_INDEX_BASE; + int i = (int)THTensor_(get2d)(connTable,k,0) - TH_INDEX_BASE; + + /* gradient to kernel */ + THTensor_(validXCorr2DRevptr)( + gradWeight_data + k*kW*kH, + scale, + input_data + i*input_w*input_h + m*nInputPlane*input_w*input_h, input_h, input_w, + gradOutput_data + o*output_w*output_h + 
m*nOutputPlane*output_w*output_h , output_h, output_w, + dH, dW + ); + } + } + + /* clean up */ + THTensor_(free)(input); + THTensor_(free)(gradOutput); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialDepthWiseConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialDepthWiseConvolution.c new file mode 100644 index 000000000..efb66a3e3 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialDepthWiseConvolution.c @@ -0,0 +1,528 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialDepthWiseConvolution.c" +#else + +static inline void THNN_(SpatialDepthWiseConvolution_shapeCheck)( + THTensor *input, THTensor *gradOutput, + THTensor *weight, THTensor *bias, + int kH, int kW, int dH, int dW, int padH, int padW) { + + THArgCheck(kW > 0 && kH > 0, 9, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + THNN_ARGCHECK(weight->nDimension == 4, 5, weight, + "2D or 4D weight tensor expected, but got: %s"); + + if (bias != NULL) { + THNN_CHECK_DIM_SIZE(bias, 2, 0, weight->size[0]); + THNN_CHECK_DIM_SIZE(bias, 2, 1, weight->size[1]); + } + + int ndim = input->nDimension; + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input, + "3D or 4D input tensor expected but got: %s"); + + long nInputPlane = weight->size[1]; + long inputHeight = input->size[dimh]; + long inputWidth = input->size[dimw]; + long nOutputPlane = weight->size[0]; + long outputHeight = (inputHeight + 2*padH - kH) / dH + 1; + long outputWidth = (inputWidth + 2*padW - kW) / dW + 1; + + if (outputWidth < 1 || outputHeight < 1) + THError("Given input size: (%d x %d x %d). " + "Calculated output size: (%d x %d x %d). 
Output size is too small", + nInputPlane,inputHeight,inputWidth,nOutputPlane*nInputPlane,outputHeight,outputWidth); + + THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim + 1, dimf, nInputPlane); + THNN_CHECK_DIM_SIZE(gradOutput, ndim + 1, dimh, nOutputPlane); + THNN_CHECK_DIM_SIZE(gradOutput, ndim + 1, dimw, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim + 1, dimw + 1, outputWidth); + } +} + +static void THNN_(SpatialDepthWiseConvolution_updateOutput_frame)( + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + long nInputPlane, + long inputWidth, + long inputHeight, + long nOutputPlane, + long outputWidth, + long outputHeight) +{ + long i; + THTensor *output2d; + + THNN_(unfolded_copy)(finput, input, kW, kH, dW, dH, padW, padH, + nInputPlane, inputWidth, inputHeight, + outputWidth, outputHeight); + + output2d = THTensor_(newWithStorage2d)(output->storage, output->storageOffset, + nOutputPlane, -1, + outputHeight*outputWidth, -1); + if (bias) { + for(i = 0; i < nOutputPlane; i++) + THVector_(fill) + (output->storage->data + output->storageOffset + output->stride[0] * i, + THTensor_(get1d)(bias, i), outputHeight*outputWidth); + } else { + THTensor_(zero)(output); + } + + THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput); + + THTensor_(free)(output2d); +} + +void THNN_(SpatialDepthWiseConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH) +{ + long nInputPlane = weight->nDimension == 2 ? 
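/* Depthwise semantics: weight is nOutputPlane x nInputPlane x kH x kW and
   every input plane is convolved with its own nOutputPlane filters, so a
   3-channel input with a 2 x 3 x kH x kW weight yields 3*2 = 6 output
   maps (nOutputPlane acts as a depth multiplier); the 5D output is
   flattened to (T, nInputPlane*nOutputPlane, oH, oW) at the end. */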
weight->size[1]/(kH*kW) : weight->size[1]; + long nOutputPlane = weight->size[0]; + if (weight->nDimension == 2) { + THTensor_(resize4d)(weight, nOutputPlane, nInputPlane, kH, kW); + } + + THNN_(SpatialDepthWiseConvolution_shapeCheck) + (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW); + + THTensor *_weight = THTensor_(newTranspose)(weight, 0, 1); + weight = THTensor_(newContiguous)(_weight); + + THTensor *_bias = NULL; + if(bias) { + _bias = THTensor_(newTranspose)(bias, 0, 1); + bias = THTensor_(newContiguous)(_bias); + } + + // resize weight + long s1 = weight->size[0]; + long s2 = weight->size[1]; + long s3 = weight->size[2] * weight->size[3]; + weight = THTensor_(newWithStorage3d)(weight->storage, weight->storageOffset, + s1, -1, s2, -1, s3, -1); + + input = THTensor_(newContiguous)(input); + + int ndim = input->nDimension; + + int batch = 1; + if (ndim == 3) { + // Force batch + batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + } + + long inputHeight = input->size[3]; + long inputWidth = input->size[2]; + long outputHeight = (inputHeight + 2*padH - kH) / dH + 1; + long outputWidth = (inputWidth + 2*padW - kW) / dW + 1; + + long T = input->size[0]; + long t; + + THTensor_(resize5d)(output, T, nInputPlane, nOutputPlane, outputHeight, outputWidth); + THTensor_(resize4d)(finput, T, nInputPlane, kW*kH*1, outputHeight*outputWidth); + +#pragma omp parallel for private(t) + for(t = 0; t < T; t++) + { + THTensor *input_t = THTensor_(newSelect)(input, 0, t); + THTensor *output_t = THTensor_(newSelect)(output, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + long i; +#pragma omp parallel for private(i) + for(i = 0; i < nInputPlane; i++) + { + THTensor *weight_i = THTensor_(newSelect)(weight, 0, i); + THTensor *input_i = THTensor_(newNarrow)(input_t, 0, i, 1); + THTensor *output_i = THTensor_(newSelect)(output_t, 0, i); + THTensor *finput_i = THTensor_(newSelect)(finput_t, 0, i); + THTensor *bias_i = NULL; + if(bias) { + bias_i = THTensor_(newSelect)(bias, 0, i); + } + THNN_(SpatialDepthWiseConvolution_updateOutput_frame) + (input_i, output_i, weight_i, bias_i, finput_i, + kW, kH, dW, dH, padW, padH, + 1, inputWidth, inputHeight, + nOutputPlane, outputWidth, outputHeight); + + THTensor_(free)(input_i); + THTensor_(free)(weight_i); + THTensor_(free)(bias_i); + THTensor_(free)(output_i); + THTensor_(free)(finput_i); + } + THTensor_(free)(input_t); + THTensor_(free)(output_t); + THTensor_(free)(finput_t); + } + + THTensor_(free)(weight); + THTensor_(free)(_weight); + THTensor_(free)(bias); + THTensor_(free)(_bias); + THTensor_(resize4d)(output, T, nInputPlane * nOutputPlane, outputHeight, outputWidth); + + if (batch == 0) { + THTensor_(select)(output, NULL, 0, 0); + THTensor_(select)(input, NULL, 0, 0); + THTensor_(select)(finput, NULL, 0, 0); + } + THTensor_(free)(input); +} + +static void THNN_(SpatialDepthWiseConvolution_updateGradInput_frame)( + THTensor *gradInput, + THTensor *gradOutput, + THTensor *weight, + THTensor *fgradInput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH) +{ + THTensor *gradOutput2d = THTensor_(newWithStorage2d) + (gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + gradOutput->size[1]*gradOutput->size[2], -1); + THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d); + THTensor_(free)(gradOutput2d); + + THTensor_(zero)(gradInput); + + THNN_(unfolded_acc)(fgradInput, gradInput, kW, kH, dW, dH, + padW, padH, + gradInput->size[0], 
gradInput->size[2], gradInput->size[1], + gradOutput->size[2], gradOutput->size[1]); +} + +void THNN_(SpatialDepthWiseConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH) +{ + long nInputPlane = weight->nDimension == 2 ? weight->size[1]/(kH*kW) : weight->size[1]; + long nOutputPlane = weight->size[0]; + if (weight->nDimension == 2) { + THTensor_(resize4d)(weight, nOutputPlane, nInputPlane, kH, kW); + } + gradOutput = THTensor_(newWithTensor)(gradOutput); + + if (input->nDimension == 3) { + if (gradOutput->nDimension == 3) { + THTensor_(resize4d)(gradOutput, nInputPlane, nOutputPlane, gradOutput->size[1], gradOutput->size[2]); + } + } + else + { + if (gradOutput->nDimension == 4) { + THTensor_(resize5d)(gradOutput, gradOutput->size[0], nInputPlane, nOutputPlane, gradOutput->size[2], gradOutput->size[3]); + } + } + + + THNN_(SpatialDepthWiseConvolution_shapeCheck) + (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW); + + THTensor *_weight = THTensor_(newTranspose)(weight, 0, 1); + weight = THTensor_(newContiguous)(_weight); + + + // resize weight + long s1 = weight->size[0]; + long s2 = weight->size[1]; + long s3 = weight->size[2] * weight->size[3]; + weight = THTensor_(newWithStorage3d)(weight->storage, weight->storageOffset, + s1, -1, s2, -1, s3, -1); + + input = THTensor_(newContiguous)(input); + + int batch = 1; + if (input->nDimension == 3) { + // Force batch + batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + long inputHeight = input->size[3]; + long inputWidth = input->size[2]; + long outputHeight = (inputHeight + 2*padH - kH) / dH + 1; + long outputWidth = (inputWidth + 2*padW - kW) / dW + 1; + + long T = input->size[0]; + long t; + + THTensor_(resizeAs)(gradInput, input); + THTensor_(resize4d)(fgradInput, T, nInputPlane, kW*kH*1, outputHeight*outputWidth); + + // depending on the BLAS library, fgradInput (result tensor) might + // be left uninitialized on zero alpha, which might lead to weird behavior + // hence, to be safe, zero it + THTensor_(zero)(fgradInput); + + + +#pragma omp parallel for private(t) + for(t = 0; t < T; t++) + { + THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t); + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t); + + + long i; +#pragma omp parallel for private(i) + for(i = 0; i < nInputPlane; i++) + { + THTensor *weight_i = THTensor_(newSelect)(weight, 0, i); + THTensor *gradInput_i = THTensor_(newNarrow)(gradInput_t, 0, i, 1); + THTensor *gradOutput_i = THTensor_(newSelect)(gradOutput_t, 0, i); + THTensor *fgradInput_i = THTensor_(newSelect)(fgradInput_t, 0, i); + + THTensor_(transpose)(weight_i, weight_i, 0, 1); + + THNN_(SpatialDepthWiseConvolution_updateGradInput_frame)(gradInput_i, gradOutput_i, + weight_i, fgradInput_i, + kW, kH, dW, dH, padW, padH); + + THTensor_(free)(gradInput_i); + THTensor_(free)(weight_i); + THTensor_(free)(gradOutput_i); + THTensor_(free)(fgradInput_i); + } + + THTensor_(free)(gradInput_t); + THTensor_(free)(gradOutput_t); + THTensor_(free)(fgradInput_t); + } + + if (batch == 0) { + THTensor_(select)(gradOutput, NULL, 0, 0); + THTensor_(select)(input, NULL, 0, 0); + 
THTensor_(select)(gradInput, NULL, 0, 0); + THTensor_(select)(fgradInput, NULL, 0, 0); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); + THTensor_(free)(_weight); +} + +static void THNN_(SpatialDepthWiseConvolution_accGradParameters_frame)( + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + accreal scale) +{ + long i; + THTensor *gradOutput2d = THTensor_(newWithStorage2d) + (gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + gradOutput->size[1]*gradOutput->size[2], -1); + + THTensor_(transpose)(finput, finput, 0, 1); + THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, finput); + THTensor_(transpose)(finput, finput, 0, 1); + + if (gradBias) { + for(i = 0; i < gradBias->size[0]; i++) + { + long k; + real sum = 0; + real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0]; + for(k = 0; k < gradOutput2d->size[1]; k++) + sum += data[k]; + (gradBias->storage->data + gradBias->storageOffset)[i] += scale*sum; + } + } + + THTensor_(free)(gradOutput2d); +} + +void THNN_(SpatialDepthWiseConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + accreal scale) +{ + long nInputPlane = gradWeight->nDimension == 2 ? gradWeight->size[1]/(kH*kW) : gradWeight->size[1]; + long nOutputPlane = gradWeight->size[0]; + if (gradWeight->nDimension == 2) { + THTensor_(resize4d)(gradWeight, nOutputPlane, nInputPlane, kH, kW); + } + + gradOutput = THTensor_(newWithTensor)(gradOutput); + if (input->nDimension == 3) { + if (gradOutput->nDimension == 3) { + THTensor_(resize4d)(gradOutput, nInputPlane, nOutputPlane, gradOutput->size[1], gradOutput->size[2]); + } + } + else + { + if (gradOutput->nDimension == 4) { + THTensor_(resize5d)(gradOutput, gradOutput->size[0], nInputPlane, nOutputPlane, gradOutput->size[2], gradOutput->size[3]); + } + } + + + THNN_(SpatialDepthWiseConvolution_shapeCheck) + (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW); + + // Transpose gradWeight & gradBias + THTensor_(transpose)(gradWeight, NULL, 0, 1); + THTensor *_gradWeight; + _gradWeight = gradWeight; + gradWeight = THTensor_(newContiguous)(gradWeight); + + THTensor *_gradBias = NULL; + if(gradBias) { + THTensor_(transpose)(gradBias, NULL, 0, 1); + _gradBias = gradBias; + gradBias = THTensor_(newContiguous)(gradBias); + } + + // resize gradWeight + long s1 = gradWeight->size[0]; + long s2 = gradWeight->size[1]; + long s3 = gradWeight->size[2] * gradWeight->size[3]; + gradWeight = THTensor_(newWithStorage3d)(gradWeight->storage, gradWeight->storageOffset, + s1, -1, s2, -1, s3, -1); + + input = THTensor_(newContiguous)(input); + + + int batch = 1; + if (input->nDimension == 3) { + // Force batch + batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + long inputHeight = input->size[3]; + long inputWidth = input->size[2]; + long outputHeight = (inputHeight + 2*padH - kH) / dH + 1; + long outputWidth = (inputWidth + 2*padW - kW) / dW + 1; + + long T = input->size[0]; + long t; + THTensor_(resize4d)(finput, T, nInputPlane, kW*kH*1, outputHeight*outputWidth); + + for(t = 0; t < T; t++) + { + THTensor 
*gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + long i; +#pragma omp parallel for private(i) + for(i = 0; i < nInputPlane; i++) + { + THTensor *finput_i = THTensor_(newSelect)(finput_t, 0, i); + THTensor *gradOutput_i = THTensor_(newSelect)(gradOutput_t, 0, i); + THTensor *gradWeight_i = THTensor_(newSelect)(gradWeight, 0, i); + THTensor *gradBias_i = NULL; + if(gradBias) { + gradBias_i = THTensor_(newSelect)(gradBias, 0, i); + } + THNN_(SpatialDepthWiseConvolution_accGradParameters_frame)(gradOutput_i, gradWeight_i, + gradBias_i, finput_i, scale); + + THTensor_(free)(finput_i); + THTensor_(free)(gradOutput_i); + THTensor_(free)(gradWeight_i); + THTensor_(free)(gradBias_i); + } + + THTensor_(free)(gradOutput_t); + THTensor_(free)(finput_t); + } + + // Copy back and transpose back + THTensor_(transpose)(_gradWeight, NULL, 0, 1); + THTensor_(resize4d)(_gradWeight, nInputPlane, nOutputPlane, kH, kW); + THTensor_(copy)(_gradWeight, gradWeight); + THTensor_(transpose)(_gradWeight, NULL, 0, 1); + + if(gradBias) { + THTensor_(transpose)(_gradBias, NULL, 0, 1); + THTensor_(resize2d)(_gradBias, nInputPlane, nOutputPlane); + THTensor_(copy)(_gradBias, gradBias); + THTensor_(transpose)(_gradBias, NULL, 0, 1); + } + + if (batch == 0) { + THTensor_(select)(gradOutput, NULL, 0, 0); + THTensor_(select)(input, NULL, 0, 0); + THTensor_(select)(finput, NULL, 0, 0); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(gradWeight); + THTensor_(free)(gradBias); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedConvolution.c new file mode 100644 index 000000000..897cc0da4 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedConvolution.c @@ -0,0 +1,408 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialDilatedConvolution.c" +#else + +static inline void THNN_(SpatialDilatedConvolution_shapeCheck)( + THTensor *input, THTensor *gradOutput, + THTensor *weight, THTensor *bias, + int kH, int kW, int dH, int dW, int padH, int padW, + int dilationH, int dilationW) { + + THNN_ARGCHECK(weight->nDimension == 4, 4, weight, + "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, " + "but got: %s"); + THArgCheck(kW > 0 && kH > 0, 9, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + THArgCheck(dilationW > 0 && dilationH > 0, 15, + "dilation should be greater than zero, but got dilationH: %d, dilationW: %d", + dilationH, dilationW); + + if (bias != NULL) { + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]); + } + + int ndim = input->nDimension; + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input, + "3D or 4D input tensor expected but got: %s"); + + long nInputPlane = weight->size[1]; + long inputHeight = input->size[dimh]; + long inputWidth = input->size[dimw]; + long nOutputPlane = weight->size[0]; + long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + if (outputWidth < 1 || outputHeight < 1) + THError("Given input size: (%ld x %ld x %ld). " + "Calculated output size: (%ld x %ld x %ld). 
Output size is too small", + nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth); + + THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(SpatialDilatedConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH) +{ + + THNN_(SpatialDilatedConvolution_shapeCheck) + (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, + dilationH, dilationW); + + // Params: + int nInputPlane = weight->size[1]; + int nOutputPlane = weight->size[0]; + + input = THTensor_(newContiguous)(input); + weight = THTensor_(newContiguous)(weight); + bias = bias ? THTensor_(newContiguous)(bias) : bias; + int batch = 1; + if (input->nDimension == 3) { + // Force batch + batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + } + long inputWidth = input->size[3]; + long inputHeight = input->size[2]; + long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Resize output + THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); + THTensor_(zero)(output); + + // Resize temporary columns + THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... 
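+    // (The ones buffer feeds the bias GEMM below: a [nOutputPlane x 1] bias
+    // column times this [1 x outputHeight*outputWidth] row of ones is a
+    // rank-1 product that broadcasts each bias value over its whole output
+    // plane, e.g. bias[p] lands on every entry of plane p before the
+    // convolution result is accumulated on top.)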
+    THTensor_(resize2d)(ones, outputHeight, outputWidth);
+    THTensor_(fill)(ones, 1);
+  }
+
+  // Helpers
+  THTensor *input_n = THTensor_(new)();
+  THTensor *output_n = THTensor_(new)();
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per output:
+    THTensor_(select)(input_n, input, 0, elt);
+    THTensor_(select)(output_n, output, 0, elt);
+
+    // Do Bias first:
+    // M,N,K are dims of matrix A and B
+    long m_ = nOutputPlane;
+    long n_ = outputHeight * outputWidth;
+    long k_ = 1;
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    if (bias) {
+      THBlas_(gemm)(
+        't', 'n',
+        n_, m_, k_,
+        1,
+        THTensor_(data)(ones), k_,
+        THTensor_(data)(bias), k_,
+        0,
+        THTensor_(data)(output_n), n_
+      );
+    } else {
+      THTensor_(zero)(output_n);
+    }
+
+    // Extract columns:
+    THNN_(im2col)(
+      THTensor_(data)(input_n),
+      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+      dilationH, dilationW,
+      THTensor_(data)(columns)
+    );
+
+    // M,N,K are dims of matrix A and B
+    long m = nOutputPlane;
+    long n = columns->size[1];
+    long k = nInputPlane*kH*kW;
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    THBlas_(gemm)(
+      'n', 'n',
+      n, m, k,
+      1,
+      THTensor_(data)(columns), n,
+      THTensor_(data)(weight), k,
+      1,
+      THTensor_(data)(output_n), n
+    );
+  }
+
+  // Free
+  THTensor_(free)(input_n);
+  THTensor_(free)(output_n);
+
+  // Resize output
+  if (batch == 0) {
+    THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
+    THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+  }
+
+  THTensor_(free)(input);
+  THTensor_(free)(weight);
+  if (bias) THTensor_(free)(bias);
+}
+
+void THNN_(SpatialDilatedConvolution_updateGradInput)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *gradOutput,
+    THTensor *gradInput,
+    THTensor *weight,
+    THTensor *gradColumns,
+    int kW, int kH,
+    int dW, int dH,
+    int padW, int padH,
+    int dilationW, int dilationH)
+{
+  THNN_(SpatialDilatedConvolution_shapeCheck)
+    (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW,
+     dilationH, dilationW);
+
+  // Params
+  int nInputPlane = weight->size[1];
+  int nOutputPlane = weight->size[0];
+
+  input = THTensor_(newContiguous)(input);
+  weight = THTensor_(newContiguous)(weight);
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+  int batch = 1;
+  if (input->nDimension == 3) {
+    // Force batch
+    batch = 0;
+    THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+    THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1],
+                        gradOutput->size[2]);
+  }
+
+  long inputWidth = input->size[3];
+  long inputHeight = input->size[2];
+  long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  // Batch size + input planes
+  long batchSize = input->size[0];
+
+  // Resize output
+  THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
+
+  // Resize temporary columns
+  THTensor_(resize2d)(gradColumns, nInputPlane*kW*kH, outputHeight*outputWidth);
+  THTensor_(zero)(gradColumns);
+
+  // Helpers
+  THTensor *gradInput_n = THTensor_(new)();
+  THTensor *gradOutput_n = THTensor_(new)();
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per sample:
+    THTensor_(select)(gradInput_n, gradInput, 0, elt);
+    THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+    // M,N,K are dims 
of matrix A and B + long m = nInputPlane*kW*kH; + long n = gradColumns->size[1]; + long k = nOutputPlane; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 't', + n, m, k, + 1, + THTensor_(data)(gradOutput_n), n, + THTensor_(data)(weight), m, + 0, + THTensor_(data)(gradColumns), n + ); + + // Unpack columns back into input: + THNN_(col2im)( + THTensor_(data)(gradColumns), + nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW, + dilationH, dilationW, + THTensor_(data)(gradInput_n) + ); + } + + // Free + THTensor_(free)(gradInput_n); + THTensor_(free)(gradOutput_n); + + // Resize output + if (batch == 0) { + THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); + THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); + THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); +} + + +void THNN_(SpatialDilatedConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + THNN_(SpatialDilatedConvolution_shapeCheck) + (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, + dilationH, dilationW); + + // Params + int nInputPlane = gradWeight->size[1]; + int nOutputPlane = gradWeight->size[0]; + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous"); + if (gradBias) + THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous"); + int batch = 1; + if (input->nDimension == 3) { + // Force batch + batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], + gradOutput->size[1], gradOutput->size[2]); + } + + long inputWidth = input->size[3]; + long inputHeight = input->size[2]; + long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... 
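+    // (Here the ones buffer drives the gradBias GEMV further below:
+    // multiplying the [nOutputPlane x outputHeight*outputWidth] gradOutput
+    // slice by a vector of ones sums each plane's gradients, which is exactly
+    // dLoss/dBias[p], since the bias contributes to every output pixel of
+    // plane p.)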
+    THTensor_(resize2d)(ones, outputHeight, outputWidth);
+    THTensor_(fill)(ones, 1);
+  }
+
+  // Resize temporary columns
+  THTensor_(resize2d)(columns, nInputPlane*kW*kH, outputHeight*outputWidth);
+
+  // Helpers
+  THTensor *input_n = THTensor_(new)();
+  THTensor *gradOutput_n = THTensor_(new)();
+
+  // For each elt in batch, do:
+  for (int elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per output:
+    THTensor_(select)(input_n, input, 0, elt);
+    THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+    // Extract columns:
+    THNN_(im2col)(
+      THTensor_(data)(input_n),
+      nInputPlane, inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+      dilationH, dilationW,
+      THTensor_(data)(columns)
+    );
+
+    // M,N,K are dims of matrix A and B
+    long m = nOutputPlane;
+    long n = nInputPlane*kW*kH;
+    long k = columns->size[1];
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    THBlas_(gemm)(
+      't', 'n',
+      n, m, k,
+      scale,
+      THTensor_(data)(columns), k,
+      THTensor_(data)(gradOutput_n), k,
+      1,
+      THTensor_(data)(gradWeight), n
+    );
+
+    // Do Bias:
+    // M,N,K are dims of matrix A and B
+    long m_ = nOutputPlane;
+    long k_ = outputHeight * outputWidth;
+
+    // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
+    if (gradBias) {
+      THBlas_(gemv)(
+        't',
+        k_, m_,
+        scale,
+        THTensor_(data)(gradOutput_n), k_,
+        THTensor_(data)(ones), 1,
+        1,
+        THTensor_(data)(gradBias), 1
+      );
+    }
+  }
+
+  // Free
+  THTensor_(free)(input_n);
+  THTensor_(free)(gradOutput_n);
+
+  // Resize
+  if (batch == 0) {
+    THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
+    THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+  }
+
+  THTensor_(free)(input);
+  THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedMaxPooling.c
new file mode 100644
index 000000000..8f4ad13c3
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialDilatedMaxPooling.c
@@ -0,0 +1,401 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialDilatedMaxPooling.c"
+#else
+
+static inline void THNN_(SpatialDilatedMaxPooling_shapeCheck)(
+  THTensor *input, THTensor *gradOutput, THIndexTensor *indices,
+  int kH, int kW, int dH, int dW, int padH, int padW,
+  int dilationH, int dilationW, bool ceil_mode) {
+
+  THArgCheck(kW > 0 && kH > 0, 5,
+             "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW);
+  THArgCheck(dW > 0 && dH > 0, 8,
+             "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
+  THArgCheck(dilationH > 0 && dilationW > 0, 12,
+             "dilation should be greater than zero, but got dilationH: %d dilationW: %d",
+             dilationH, dilationW);
+
+  int ndim = input->nDimension;
+  int dimf = 0;
+  int dimh = 1;
+  int dimw = 2;
+
+  if (ndim == 4) {
+    dimf++;
+    dimh++;
+    dimw++;
+  }
+
+  THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input,
+                "3D or 4D input tensor expected but got: %s");
+
+  THArgCheck(kW/2 >= padW && kH/2 >= padH, 2,
+             "pad should be smaller than half of kernel size, but got "
+             "padW = %d, padH = %d, kW = %d, kH = %d",
+             padW, padH, kW, kH);
+
+  long nInputPlane = input->size[dimh-1];
+  long inputHeight = input->size[dimh];
+  long inputWidth = input->size[dimw];
+  long outputHeight, outputWidth;
+  long nOutputPlane = nInputPlane;
+
+  if (ceil_mode)
+  {
+    outputHeight = (long)(ceil((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1;
+    outputWidth = 
(long)(ceil((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1; + } + else + { + outputHeight = (long)(floor((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; + outputWidth = (long)(floor((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1; + } + + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + if (outputWidth < 1 || outputHeight < 1) + THError("Given input size: (%dx%dx%d). " + "Calculated output size: (%dx%dx%d). Output size is too small", + nInputPlane,inputHeight,inputWidth,nInputPlane,outputHeight,outputWidth); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } + if (indices != NULL) { + THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimf, nOutputPlane); + THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimw, outputWidth); + } +} + +static void THNN_(SpatialDilatedMaxPooling_updateOutput_frame)( + real *input_p, + real *output_p, + THIndex_t *ind_p, + long nslices, + long iwidth, + long iheight, + long owidth, + long oheight, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH + ) +{ + long k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + /* loop over output */ + long i, j; + real *ip = input_p + k*iwidth*iheight; + for(i = 0; i < oheight; i++) + { + for(j = 0; j < owidth; j++) + { + long hstart = i * dH - padH; + long wstart = j * dW - padW; + long hend = fminf(hstart + (kH - 1) * dilationH + 1, iheight); + long wend = fminf(wstart + (kW - 1) * dilationW + 1, iwidth); + while(hstart < 0) + hstart += dilationH; + while(wstart < 0) + wstart += dilationW; + + /* local pointers */ + real *op = output_p + k*owidth*oheight + i*owidth + j; + THIndex_t *indp = ind_p + k*owidth*oheight + i*owidth + j; + + /* compute local max: */ + long maxindex = -1; + real maxval = -THInf; + long tcntr = 0; + long x,y; + for(y = hstart; y < hend; y += dilationH) + { + for(x = wstart; x < wend; x += dilationW) + { + tcntr = y*iwidth + x; + real val = *(ip + tcntr); + if (val > maxval) + { + maxval = val; + maxindex = tcntr; + } + } + } + + /* set output to local max */ + *op = maxval; + + /* store location of max */ + *indp = maxindex + TH_INDEX_BASE; + } + } + } +} + +void THNN_(SpatialDilatedMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + bool ceil_mode) +{ + + int dimw = 2; + int dimh = 1; + long nbatch = 1; + long nInputPlane; + long inputHeight; + long inputWidth; + long outputHeight; + long outputWidth; + real *input_data; + real *output_data; + THIndex_t *indices_data; + + THNN_(SpatialDilatedMaxPooling_shapeCheck) + (input, NULL, NULL, kH, kW, dH, dW, + padH, padW, dilationH, dilationW, ceil_mode); + + if (input->nDimension == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + /* sizes */ + nInputPlane = input->size[dimh-1]; + inputHeight = input->size[dimh]; + inputWidth = input->size[dimw]; + if (ceil_mode) + { + outputHeight = (long)(ceil((float)(inputHeight - 
(dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; + outputWidth = (long)(ceil((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1; + } + else + { + outputHeight = (long)(floor((float)(inputHeight - (dilationH * (kH - 1) + 1) + 2*padH) / dH)) + 1; + outputWidth = (long)(floor((float)(inputWidth - (dilationW * (kW - 1) + 1) + 2*padW) / dW)) + 1; + } + + if (padW || padH) + { + // ensure that the last pooling starts inside the image + // needed to avoid problems in ceil mode + if ((outputHeight - 1)*dH >= inputHeight + padH) + --outputHeight; + if ((outputWidth - 1)*dW >= inputWidth + padW) + --outputWidth; + } + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + /* resize output */ + if (input->nDimension == 3) + { + THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth); + /* indices will contain the locations for each output point */ + THIndexTensor_(resize3d)(indices, nInputPlane, outputHeight, outputWidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + THNN_(SpatialDilatedMaxPooling_updateOutput_frame) + (input_data, output_data, + indices_data, + nInputPlane, + inputWidth, inputHeight, + outputWidth, outputHeight, + kW, kH, dW, dH, + padW, padH, + dilationW, dilationH + ); + } + else + { + long p; + + THTensor_(resize4d)(output, nbatch, nInputPlane, outputHeight, outputWidth); + /* indices will contain the locations for each output point */ + THIndexTensor_(resize4d)(indices, nbatch, nInputPlane, outputHeight, outputWidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(SpatialDilatedMaxPooling_updateOutput_frame) + (input_data+p*nInputPlane*inputWidth*inputHeight, + output_data+p*nInputPlane*outputWidth*outputHeight, + indices_data+p*nInputPlane*outputWidth*outputHeight, + nInputPlane, + inputWidth, inputHeight, + outputWidth, outputHeight, + kW, kH, dW, dH, + padW, padH, + dilationW, dilationH + ); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(SpatialDilatedMaxPooling_updateGradInput_frame)( + real *gradInput_p, + real *gradOutput_p, + THIndex_t *ind_p, + long nInputPlane, + long inputWidth, + long inputHeight, + long outputWidth, + long outputHeight, + int dW, + int dH) +{ + long k; +#pragma omp parallel for private(k) + for (k = 0; k < nInputPlane; k++) + { + real *gradInput_p_k = gradInput_p + k*inputWidth*inputHeight; + real *gradOutput_p_k = gradOutput_p + k*outputWidth*outputHeight; + THIndex_t *ind_p_k = ind_p + k*outputWidth*outputHeight; + + /* calculate max points */ + long i, j; + for(i = 0; i < outputHeight; i++) + { + for(j = 0; j < outputWidth; j++) + { + /* retrieve position of max */ + long maxp = ind_p_k[i*outputWidth + j] - TH_INDEX_BASE; + if (maxp != -1) { + /* update gradient */ + gradInput_p_k[maxp] += gradOutput_p_k[i*outputWidth + j]; + } + } + } + } +} + +void THNN_(SpatialDilatedMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + int dilationW, + int dilationH, + bool ceil_mode) +{ + int dimw = 2; + int dimh = 1; + long nbatch = 1; + int nInputPlane; + int inputHeight; + int inputWidth; + int outputHeight; + int outputWidth; + real *gradInput_data; + real *gradOutput_data; + 
THIndex_t *indices_data; + + THNN_(SpatialDilatedMaxPooling_shapeCheck) + (input, gradOutput, indices, kH, kW, dH, dW, + padH, padW, dilationH, dilationW, ceil_mode); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->nDimension == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + /* sizes */ + nInputPlane = input->size[dimh-1]; + inputHeight = input->size[dimh]; + inputWidth = input->size[dimw]; + outputHeight = gradOutput->size[dimh]; + outputWidth = gradOutput->size[dimw]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = THIndexTensor_(data)(indices); + + /* backprop */ + if (input->nDimension == 3) + { + THNN_(SpatialDilatedMaxPooling_updateGradInput_frame) + (gradInput_data, gradOutput_data, + indices_data, + nInputPlane, + inputWidth, inputHeight, + outputWidth, outputHeight, + dW, dH); + } + else + { + long p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(SpatialDilatedMaxPooling_updateGradInput_frame) + (gradInput_data+p*nInputPlane*inputWidth*inputHeight, + gradOutput_data+p*nInputPlane*outputWidth*outputHeight, + indices_data+p*nInputPlane*outputWidth*outputHeight, + nInputPlane, + inputWidth, inputHeight, + outputWidth, outputHeight, + dW, dH); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialFractionalMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialFractionalMaxPooling.c new file mode 100644 index 000000000..a98954cc6 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialFractionalMaxPooling.c @@ -0,0 +1,253 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialFractionalMaxPooling.c" +#else + +static long* THNN_(SpatialFractionalMaxPooling_generateIntervals)( + real sample, + long inputSize, + long outputSize, + int poolSize) { + real alpha = (real) (inputSize - poolSize) / (real) (outputSize - 1); + long* sequence = (long*) THAlloc(sizeof(long) * outputSize); + + long i; + for (i = 0; i < outputSize - 1; ++i) { + sequence[i] = + (long) ((i + sample) * alpha) - (long) (sample * alpha); + } + sequence[outputSize - 1] = inputSize - poolSize; + + return sequence; +} + +static void THNN_(SpatialFractionalMaxPooling_updateOutput_frame)( + real* input, + real* output, + THIndex_t* indices, + real* randomSamples, + long numPlanes, + long inputW, long inputH, + long outputW, long outputH, + int poolSizeW, int poolSizeH) { + long plane; +#pragma omp parallel for private(plane) + for (plane = 0; plane < numPlanes; ++plane) { + /* each plane contains 2 random samples, one for W and one for H */ + real* randomSamplesForPlane = randomSamples + plane * 2; + + /* Generate interval sequence */ + long* sequenceW = + THNN_(SpatialFractionalMaxPooling_generateIntervals)( + randomSamplesForPlane[0], inputW, outputW, poolSizeW); + long* sequenceH = + THNN_(SpatialFractionalMaxPooling_generateIntervals)( + randomSamplesForPlane[1], inputH, outputH, poolSizeH); + + /* loop over output */ + long h, w; + + real* inputForPlane = input + plane * inputW * inputH; + real* outputForPlane = output + plane * outputW * outputH; + THIndex_t* indicesForPlane = indices + plane * outputW * outputH; + + for (h = 0; h < outputH; ++h) { + long inputHStart = sequenceH[h]; + + for (w = 0; w < outputW; ++w) { + long inputWStart = 
sequenceW[w]; + + real maxVal = -THInf; + long maxIndex = -1; + + long h2, w2; + for (h2 = inputHStart; h2 < inputHStart + poolSizeH; ++h2) { + for (w2 = inputWStart; w2 < inputWStart + poolSizeW; ++w2) { + THAssert(h2 >= 0 && h2 < inputH); + THAssert(w2 >= 0 && w2 < inputW); + + long planeIndex = h2 * inputW + w2; + real val = inputForPlane[planeIndex]; + if (val > maxVal) { + maxVal = val; + maxIndex = planeIndex; + } + } + } + + THAssert(maxVal != -THInf); + THAssert(maxIndex != -1); + + outputForPlane[h * outputW + w] = maxVal; + /* +1 to lua index */ + indicesForPlane[h * outputW + w] = maxIndex + TH_INDEX_BASE; + } + } + + THFree(sequenceW); + THFree(sequenceH); + } +} + +void THNN_(SpatialFractionalMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THIndexTensor *indices, + THTensor *randomSamples) { + + long numBatch = 1; + int planeDim = 0; + int heightDim = 1; + int widthDim = 2; + + long numInputDims = THTensor_(nDimension)(input); + THNN_ARGCHECK(numInputDims == 3 || numInputDims == 4, 2, input, + "3D or 4D (batch mode) tensor expected for input, but got: %s"); + + if (numInputDims == 4) { + numBatch = THTensor_(size)(input, 0); + planeDim++; + heightDim++; + widthDim++; + } + + /* sizes */ + long numPlanes = THTensor_(size)(input, planeDim); + long inputH = THTensor_(size)(input, heightDim); + long inputW = THTensor_(size)(input, widthDim); + + THArgCheck(outputH + poolSizeH - 1 < inputH, 7, + "poolSizeH (%d) too large relative to input height (%d)", + poolSizeH, inputH); + THArgCheck(outputW + poolSizeW - 1 < inputW, 6, + "poolSizeW (%d) too large relative to input width (%d)", + poolSizeW, inputW); + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + if (numInputDims == 3) { + /* resize output */ + THTensor_(resize3d)(output, numPlanes, outputH, outputW); + /* indices will contain the locations for each output point */ + THIndexTensor_(resize3d)(indices, numPlanes, outputH, outputW); + + THNN_(SpatialFractionalMaxPooling_updateOutput_frame)( + THTensor_(data)(input), + THTensor_(data)(output), + THIndexTensor_(data)(indices), + THTensor_(data)(randomSamples), + numPlanes, inputW, inputH, outputW, outputH, poolSizeW, poolSizeH); + } else { + THTensor_(resize4d)(output, numBatch, numPlanes, outputH, outputW); + /* indices will contain the locations for each output point */ + THIndexTensor_(resize4d)(indices, numBatch, numPlanes, outputH, outputW); + + long batch; +#pragma omp parallel for private(batch) + for (batch = 0; batch < numBatch; ++batch) { + THNN_(SpatialFractionalMaxPooling_updateOutput_frame)( + THTensor_(data)(input) + batch * numPlanes * inputH * inputW, + THTensor_(data)(output) + batch * numPlanes * outputH * outputW, + THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW, + THTensor_(data)(randomSamples) + batch * numPlanes * 2, + numPlanes, inputW, inputH, outputW, outputH, poolSizeW, poolSizeH); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)( + real* gradInput, + real* gradOutput, + THIndex_t* indices, + long numPlanes, + long inputW, long inputH, + long outputW, long outputH) { + long plane; +#pragma omp parallel for private(plane) + for (plane = 0; plane < numPlanes; plane++) { + real* gradInputForPlane = gradInput + plane * inputW * inputH; + real* gradOutputForPlane = gradOutput + plane * outputW * outputH; + THIndex_t* indicesForPlane = 
indices + plane * outputW * outputH; + + long h, w; + for (h = 0; h < outputH; ++h) { + for (w = 0; w < outputW; ++w) { + long outputIndex = h * outputW + w; + long index = indicesForPlane[outputIndex] - TH_INDEX_BASE; + THAssert(index >= 0 && index < inputW * inputH); + + gradInputForPlane[index] += gradOutputForPlane[outputIndex]; + } + } + } +} + +void THNN_(SpatialFractionalMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THIndexTensor *indices) { + + long numBatch = 1; + int planeDim = 0; + int heightDim = 1; + int widthDim = 2; + + long numInputDims = THTensor_(nDimension)(input); + if (numInputDims == 4) { + numBatch = THTensor_(size)(input, 0); + planeDim = 1; + heightDim++; + widthDim++; + } + + /* sizes */ + long numPlanes = THTensor_(size)(input, planeDim); + long inputH = THTensor_(size)(input, heightDim); + long inputW = THTensor_(size)(input, widthDim); + + THArgCheck(outputW == THTensor_(size)(gradOutput, widthDim), 3, + "gradOutput width unexpected"); + THArgCheck(outputH == THTensor_(size)(gradOutput, heightDim), 3, + "gradOutput height unexpected"); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* backprop */ + if (numInputDims == 3) { + THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)( + THTensor_(data)(gradInput), + THTensor_(data)(gradOutput), + THIndexTensor_(data)(indices), + numPlanes, inputW, inputH, outputW, outputH); + } else { + long batch; +#pragma omp parallel for private(batch) + for (batch = 0; batch < numBatch; ++batch) { + THNN_(SpatialFractionalMaxPooling_updateGradInput_frame)( + THTensor_(data)(gradInput) + batch * numPlanes * inputH * inputW, + THTensor_(data)(gradOutput) + batch * numPlanes * outputH * outputW, + THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW, + numPlanes, inputW, inputH, outputW, outputH); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolution.c new file mode 100644 index 000000000..2edc53b5a --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolution.c @@ -0,0 +1,462 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialFullConvolution.c" +#else + +static void THNN_(im2col)(const real* data_im, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + real* data_col) { + const int height_col = (height + 2 * pad_h - + (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_col = (width + 2 * pad_w - + (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + const int channels_col = channels * kernel_h * kernel_w; + for (int c_col = 0; c_col < channels_col; ++c_col) { + int w_offset = c_col % kernel_w; + int h_offset = (c_col / kernel_w) % kernel_h; + int c_im = c_col / kernel_h / kernel_w; + for (int h_col = 0; h_col < height_col; ++h_col) { + for (int w_col = 0; w_col < width_col; ++w_col) { + int h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + int w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + data_col[(c_col * height_col + h_col) * width_col + w_col] = + (h_im >= 
0 && w_im >= 0 && h_im < height && w_im < width) ? + data_im[(c_im * height + h_im) * width + w_im] : 0; + } + } + } +} + +static void THNN_(col2im)(const real* data_col, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, + const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + real* data_im) { + memset(data_im, 0, sizeof(real) * height * width * channels); + const int height_col = (height + 2 * pad_h - + (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + const int width_col = (width + 2 * pad_w - + (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + const int channels_col = channels * kernel_h * kernel_w; + for (int c_col = 0; c_col < channels_col; ++c_col) { + int w_offset = c_col % kernel_w; + int h_offset = (c_col / kernel_w) % kernel_h; + int c_im = c_col / kernel_h / kernel_w; + for (int h_col = 0; h_col < height_col; ++h_col) { + for (int w_col = 0; w_col < width_col; ++w_col) { + int h_im = h_col * stride_h - pad_h + h_offset * dilation_h; + int w_im = w_col * stride_w - pad_w + w_offset * dilation_w; + if (h_im >= 0 && h_im < height && w_im >= 0 && w_im < width) + data_im[(c_im * height + h_im) * width + w_im] += + data_col[(c_col * height_col + h_col) * width_col + w_col]; + } + } + } +} + +static inline void THNN_(SpatialFullConvolution_shapeCheck)( + THTensor *input, THTensor *gradOutput, + THTensor *weight, THTensor *bias, + int kH, int kW, int dH, int dW, int padH, int padW, int adjH, int adjW) { + + THArgCheck(kW > 0 && kH > 0, 9, + "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); + THArgCheck(dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); + THArgCheck(adjW < dW && adjH < dH, 15, + "output adjustment must be smaller than stride, but got adjH: %d adjW: %d dH: %d dW: %d", + adjH, adjW, dH, dW); + THNN_ARGCHECK(weight->nDimension == 2 || weight->nDimension == 4, 5, weight, + "2D or 4D weight tensor expected, but got: %s"); + + if (bias != NULL) { + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[1]); + } + + int ndim = input->nDimension; + int dimf = 0; + int dimh = 1; + int dimw = 2; + + if (ndim == 4) { + dimf++; + dimh++; + dimw++; + } + + THNN_ARGCHECK(ndim == 3 || ndim == 4, 2, input, + "3D or 4D input tensor expected but got: %s"); + + long nInputPlane = weight->size[0]; + long inputHeight = input->size[dimh]; + long inputWidth = input->size[dimw]; + long nOutputPlane = weight->size[1]; + long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; + long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; + + if (outputWidth < 1 || outputHeight < 1) + THError("Given input size: (%d x %d x %d). " + "Calculated output size: (%d x %d x %d). 
Output size is too small", + nInputPlane,inputHeight,inputWidth,nOutputPlane,outputHeight,outputWidth); + + THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(SpatialFullConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH) +{ + THNN_(SpatialFullConvolution_shapeCheck) + (input, NULL, weight, bias, kH, kW, dH, dW, padH, padW, adjH, adjW); + + int nInputPlane = THTensor_(size)(weight,0); + int nOutputPlane = THTensor_(size)(weight,1); + + input = THTensor_(newContiguous)(input); + weight = THTensor_(newContiguous)(weight); + bias = bias ? THTensor_(newContiguous)(bias) : bias; + int batch = 1; + if (input->nDimension == 3) { + // Force batch + batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + } + + long inputHeight = input->size[2]; + long inputWidth = input->size[3]; + long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; + long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Resize output + THTensor_(resize4d)(output, batchSize, nOutputPlane, outputHeight, outputWidth); + + // Resize temporary columns + THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth); + THTensor_(zero)(columns); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... 
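+    // (This transposed convolution runs the conv pipeline in reverse: the
+    // GEMM below computes columns = weight^T * input, and col2im then
+    // scatter-adds the overlapping column entries into the larger output.
+    // The output extent inverts the forward formula, e.g. inputHeight = 5,
+    // dH = 2, padH = 1, kH = 4, adjH = 0 gives (5 - 1)*2 - 2*1 + 4 + 0 = 10
+    // output rows.)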
+    THTensor_(resize2d)(ones, outputHeight, outputWidth);
+    THTensor_(fill)(ones, 1);
+  }
+
+  // Helpers
+  THTensor *input_n = THTensor_(new)();
+  THTensor *output_n = THTensor_(new)();
+
+  int elt;
+  // For each elt in batch, do:
+  for (elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per output:
+    THTensor_(select)(input_n, input, 0, elt);
+    THTensor_(select)(output_n, output, 0, elt);
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m = weight->size[1] * weight->size[2] * weight->size[3];
+    long n = columns->size[1];
+    long k = weight->size[0];
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    THBlas_(gemm)(
+      'n', 't',
+      n, m, k,
+      1,
+      THTensor_(data)(input_n), n,
+      THTensor_(data)(weight), m,
+      0,
+      THTensor_(data)(columns), n
+    );
+
+    // Unpack columns back into input:
+    THNN_(col2im)(
+      THTensor_(data)(columns),
+      nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
+      1, 1,
+      THTensor_(data)(output_n)
+    );
+
+    // Do Bias after:
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m_ = nOutputPlane;
+    long n_ = outputHeight * outputWidth;
+    long k_ = 1;
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    if (bias) {
+      THBlas_(gemm)(
+        't', 'n',
+        n_, m_, k_,
+        1,
+        THTensor_(data)(ones), k_,
+        THTensor_(data)(bias), k_,
+        1,
+        THTensor_(data)(output_n), n_
+      );
+    }
+  }
+
+  // Free
+  THTensor_(free)(input_n);
+  THTensor_(free)(output_n);
+
+  // Resize output
+  if (batch == 0) {
+    THTensor_(resize3d)(output, nOutputPlane, outputHeight, outputWidth);
+    THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+  }
+
+  THTensor_(free)(input);
+  THTensor_(free)(weight);
+  if (bias) THTensor_(free)(bias);
+}
+
+void THNN_(SpatialFullConvolution_updateGradInput)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *gradOutput,
+    THTensor *gradInput,
+    THTensor *weight,
+    THTensor *gradColumns,
+    int kW, int kH,
+    int dW, int dH,
+    int padW, int padH,
+    int adjW, int adjH)
+{
+  THNN_(SpatialFullConvolution_shapeCheck)
+    (input, gradOutput, weight, NULL, kH, kW, dH, dW, padH, padW, adjH, adjW);
+
+  int nInputPlane = THTensor_(size)(weight,0);
+  int nOutputPlane = THTensor_(size)(weight,1);
+
+  input = THTensor_(newContiguous)(input);
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+  weight = THTensor_(newContiguous)(weight);
+  int batch = 1;
+  if (input->nDimension == 3) {
+    // Force batch
+    batch = 0;
+    THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]);
+    THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+  }
+
+  long inputWidth = input->size[3];
+  long inputHeight = input->size[2];
+  long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW;
+  long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH;
+
+  // Batch size + input planes
+  long batchSize = input->size[0];
+
+  // Resize output
+  THTensor_(resize4d)(gradInput, batchSize, nInputPlane, inputHeight, inputWidth);
+  THTensor_(zero)(gradInput);
+
+  // Resize temporary columns
+  THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH, inputHeight*inputWidth);
+
+  // Helpers
+  THTensor *gradInput_n = THTensor_(new)();
+  THTensor *gradOutput_n = THTensor_(new)();
+
+  int elt;
+  // For each elt in batch, do:
+  for (elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per sample:
+    THTensor_(select)(gradInput_n, 
gradInput, 0, elt); + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + THNN_(im2col)( + THTensor_(data)(gradOutput_n), + nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW, + 1, 1, + THTensor_(data)(gradColumns) + ); + + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + long m = weight->size[0]; + long n = gradColumns->size[1]; + long k = weight->size[1] * weight->size[2] * weight->size[3]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 'n', + n, m, k, + 1, + THTensor_(data)(gradColumns), n, + THTensor_(data)(weight), k, + 0, + THTensor_(data)(gradInput_n), n + ); + } + + + // Free + THTensor_(free)(gradInput_n); + THTensor_(free)(gradOutput_n); + + // Resize output + if (batch == 0) { + THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth); + THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth); + THTensor_(resize3d)(gradInput, nInputPlane, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); +} + + +void THNN_(SpatialFullConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + THNN_(SpatialFullConvolution_shapeCheck) + (input, gradOutput, gradWeight, gradBias, kH, kW, dH, dW, padH, padW, adjH, adjW); + + int nInputPlane = THTensor_(size)(gradWeight,0); + int nOutputPlane = THTensor_(size)(gradWeight,1); + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous"); + if (gradBias) + THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous"); + int batch = 1; + if (input->nDimension == 3) { + // Force batch + batch = 0; + THTensor_(resize4d)(input, 1, input->size[0], input->size[1], input->size[2]); + THTensor_(resize4d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]); + } + + long inputWidth = input->size[3]; + long inputHeight = input->size[2]; + long outputWidth = (inputWidth - 1) * dW - 2*padW + kW + adjW; + long outputHeight = (inputHeight - 1) * dH - 2*padH + kH + adjH; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->nDimension != 2 || ones->size[0]*ones->size[1] < outputHeight*outputWidth) { + // Resize plane and fill with ones... 
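+    // (Note the role reversal relative to the forward pass: im2col below
+    // unfolds gradOutput rather than input, and the GEMM accumulates
+    // scale * input * columns^T into gradWeight, whose leading dimension is
+    // nInputPlane in this transposed layout.)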
+    THTensor_(resize2d)(ones, outputHeight, outputWidth);
+    THTensor_(fill)(ones, 1);
+  }
+
+  // Resize temporary columns
+  THTensor_(resize2d)(columns, nOutputPlane*kW*kH, inputHeight*inputWidth);
+
+  // Helpers
+  THTensor *input_n = THTensor_(new)();
+  THTensor *gradOutput_n = THTensor_(new)();
+
+  int elt;
+  // For each elt in batch, do:
+  for (elt = 0; elt < batchSize; elt ++) {
+    // Matrix multiply per output:
+    THTensor_(select)(input_n, input, 0, elt);
+    THTensor_(select)(gradOutput_n, gradOutput, 0, elt);
+
+    // Extract columns:
+    THNN_(im2col)(
+      THTensor_(data)(gradOutput_n),
+      nOutputPlane, outputHeight, outputWidth, kH, kW, padH, padW, dH, dW,
+      1, 1,
+      THTensor_(data)(columns)
+    );
+
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long n = columns->size[0];   // nOutputPlane * kh * kw
+    long m = input_n->size[0];   // nInputPlane
+    long k = columns->size[1];   // inputHeight * inputWidth
+
+    // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices)
+    THBlas_(gemm)(
+      't', 'n',
+      n, m, k,
+      scale,
+      THTensor_(data)(columns), k,
+      THTensor_(data)(input_n), k,
+      1,
+      THTensor_(data)(gradWeight), n
+    );
+
+    // Do Bias:
+    // M,N,K are dims of matrix A and B
+    // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm)
+    long m_ = nOutputPlane;
+    long k_ = outputHeight * outputWidth;
+
+    // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices)
+    if (gradBias) {
+      THBlas_(gemv)(
+        't',
+        k_, m_,
+        scale,
+        THTensor_(data)(gradOutput_n), k_,
+        THTensor_(data)(ones), 1,
+        1,
+        THTensor_(data)(gradBias), 1
+      );
+    }
+  }
+
+  // Free
+  THTensor_(free)(input_n);
+  THTensor_(free)(gradOutput_n);
+
+  // Resize
+  if (batch == 0) {
+    THTensor_(resize3d)(gradOutput, nOutputPlane, outputHeight, outputWidth);
+    THTensor_(resize3d)(input, nInputPlane, inputHeight, inputWidth);
+  }
+
+  THTensor_(free)(input);
+  THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolutionMap.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolutionMap.c
new file mode 100644
index 000000000..6952fbe25
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialFullConvolutionMap.c
@@ -0,0 +1,222 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/SpatialFullConvolutionMap.c"
+#else
+
+void THNN_(SpatialFullConvolutionMap_updateOutput)(
+  THNNState *state, THTensor *input, THTensor *output_, THTensor *weight, THTensor *bias,
+  THTensor *connTable, int nInputPlane, int nOutputPlane,
+  int dW, int dH)
+{
+  THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+  THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous");
+  THArgCheck(
+    weight != NULL && weight->nDimension == 3
+    && connTable != NULL && connTable->size[0] == weight->size[0], 4,
+    "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE
+  );
+
+  const int kH = (int)weight->size[1];
+  const int kW = (int)weight->size[2];
+
+  THArgCheck(input != NULL && input->nDimension == 3, 2, "3D tensor expected");
+  THArgCheck(input->size[0] >= nInputPlane, 2, "invalid number of input planes");
+
+  THTensor_(resize3d)(
+    output_, nOutputPlane,
+    (input->size[1] - 1) * dH + kH,
+    (input->size[2] - 1) * dW + kW
+  );
+
+  /* contiguous */
+  input = THTensor_(newContiguous)(input);
+  THTensor* output = THTensor_(newContiguous)(output_);
+
+  /* get raw pointers */
+  real *input_data = THTensor_(data)(input);
+  real 
*output_data = THTensor_(data)(output); + real *weight_data = THTensor_(data)(weight); + real *bias_data = THTensor_(data)(bias); + real *connTable_data = THTensor_(data)(connTable); + + /* and dims */ + const long input_h = input->size[1]; + const long input_w = input->size[2]; + const long output_h = output->size[1]; + const long output_w = output->size[2]; + const long weight_h = weight->size[1]; + const long weight_w = weight->size[2]; + + long p; +#pragma omp parallel for private(p) + for (p = 0; p < nOutputPlane; p++) + { + /* add bias */ + real *ptr_output = output_data + p*output_w*output_h; + long j; + int nweight; + long k; + + for (j = 0; j < output_h*output_w; j++) + ptr_output[j] = bias_data[p]; + + /* convolve all maps */ + nweight = connTable->size[0]; + for (k = 0; k < nweight; k++) + { + /* get offsets for input/output */ + int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE; + int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE; + + if (o == p) + { + THTensor_(fullConv2Dptr)( + output_data + o*output_w*output_h, + 1.0, + input_data + i*input_w*input_h, input_h, input_w, + weight_data + k*weight_w*weight_h, weight_h, weight_w, + dH, dW + ); + } + } + } + + /* clean up */ + THTensor_(free)(input); + THTensor_(freeCopyTo)(output, output_); +} + +void THNN_(SpatialFullConvolutionMap_updateGradInput)( + THNNState *state, THTensor *input, THTensor *gradOutput, THTensor *gradInput_, THTensor *weight, THTensor *bias, + THTensor *connTable, int nInputPlane, int nOutputPlane, + int dW, int dH) +{ + THArgCheck( + weight != NULL && weight->nDimension == 3 + && connTable != NULL && connTable->size[0] == weight->size[0], 5, + "3D weight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE + ); + + /* contiguous */ + THTensor* gradInput = THTensor_(newContiguous)(gradInput_); + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* Resize/Zero */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* get raw pointers */ + real *gradInput_data = THTensor_(data)(gradInput); + real *gradOutput_data = THTensor_(data)(gradOutput); + real *weight_data = THTensor_(data)(weight); + real *connTable_data = THTensor_(data)(connTable); + + /* and dims */ + const long input_h = input->size[1]; + const long input_w = input->size[2]; + const long output_h = gradOutput->size[1]; + const long output_w = gradOutput->size[2]; + const long kH = weight->size[1]; + const long kW = weight->size[2]; + + long p; +#pragma omp parallel for private(p) + for (p = 0; p < nInputPlane; p++) + { + long k; + /* backward all */ + int nkernel = connTable->size[0]; + for (k = 0; k < nkernel; k++) + { + int o = (int)connTable_data[k*2+1] - TH_INDEX_BASE; + int i = (int)connTable_data[k*2+0] - TH_INDEX_BASE; + if (i == p) + { + /* gradient to input */ + THTensor_(validXCorr2Dptr)( + gradInput_data + i*input_w*input_h, + 1.0, + gradOutput_data + o*output_w*output_h, output_h, output_w, + weight_data + k*kW*kH, kH, kW, + dH, dW + ); + } + } + } + + /* clean up */ + THTensor_(freeCopyTo)(gradInput, gradInput_); + THTensor_(free)(gradOutput); +} + +void THNN_(SpatialFullConvolutionMap_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *connTable, + int nInputPlane, + int nOutputPlane, + int dW, int dH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + THArgCheck( + gradWeight != NULL && gradWeight->nDimension == 3 + && connTable != NULL && connTable->size[0] == gradWeight->size[0], 5, 
+ "3D gradWeight tensor expected (connTable:size(%d) x kH x kW)", TH_INDEX_BASE + ); + + /* contiguous */ + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* get raw pointers */ + real *input_data = THTensor_(data)(input); + real *gradOutput_data = THTensor_(data)(gradOutput); + real *gradWeight_data = THTensor_(data)(gradWeight); + real *gradBias_data = THTensor_(data)(gradBias); + + /* and dims */ + const long input_h = input->size[1]; + const long input_w = input->size[2]; + const long output_h = gradOutput->size[1]; + const long output_w = gradOutput->size[2]; + const long weight_h = gradWeight->size[1]; + const long weight_w = gradWeight->size[2]; + + /* gradients wrt bias */ + long k; +#pragma omp parallel for private(k) + for (k = 0; k < nOutputPlane; k++) + { + real *ptr_gradOutput = gradOutput_data + k*output_w*output_h; + long l; + for (l = 0; l < output_h*output_w; l++) + gradBias_data[k] += scale*ptr_gradOutput[l]; + } + + /* gradients wrt weight */ + int nkernel = connTable->size[0]; +#pragma omp parallel for private(k) + for (k = 0; k < nkernel; k++) + { + int o = (int)THTensor_(get2d)(connTable,k,1) - TH_INDEX_BASE; + int i = (int)THTensor_(get2d)(connTable,k,0) - TH_INDEX_BASE; + + /* gradient to kernel */ + THTensor_(validXCorr2DRevptr)( + gradWeight_data + k*weight_w*weight_h, + scale, + gradOutput_data + o*output_w*output_h, output_h, output_w, + input_data + i*input_w*input_h, input_h, input_w, + dH, dW + ); + } + + /* clean up */ + THTensor_(free)(input); + THTensor_(free)(gradOutput); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxPooling.c new file mode 100644 index 000000000..88aaa40e1 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxPooling.c @@ -0,0 +1,44 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialMaxPooling.c" +#else + +void THNN_(SpatialMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + bool ceil_mode) +{ + THNN_(SpatialDilatedMaxPooling_updateOutput)( + state, input, output, indices, + kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode + ); +} + +void THNN_(SpatialMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kW, + int kH, + int dW, + int dH, + int padW, + int padH, + bool ceil_mode) +{ + THNN_(SpatialDilatedMaxPooling_updateGradInput)( + state, input, gradOutput, gradInput, indices, + kW, kH, dW, dH, padW, padH, 1, 1, ceil_mode + ); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxUnpooling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxUnpooling.c new file mode 100644 index 000000000..320538686 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialMaxUnpooling.c @@ -0,0 +1,234 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialMaxUnpooling.c" +#else + +static void THNN_(SpatialMaxUnpooling_updateOutput_frame)(real *input_p, real *output_p, + THIndex_t *ind_p, + int nslices, + int iwidth, int iheight, + int owidth, int oheight) +{ + int k; + int has_error = 0; + THIndex_t error_index; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + real *output_p_k = output_p + k*owidth*oheight; + real *input_p_k = input_p + k*iwidth*iheight; + THIndex_t *ind_p_k = ind_p + k*iwidth*iheight; + + 
int i, j; + THIndex_t maxp; + for(i = 0; i < iheight; i++) + { + for(j = 0; j < iwidth; j++) + { + maxp = ind_p_k[i*iwidth + j] - TH_INDEX_BASE; /* retrieve position of max */ + if(maxp<0 || maxp>=owidth*oheight){ +#pragma omp critical + { + has_error = 1; + error_index = maxp; + } + } else { + output_p_k[maxp] = input_p_k[i*iwidth + j]; /* update output */ + } + } + } + } + if (has_error) { + THError("found an invalid max index %ld (output volumes are of size %dx%d)", + error_index, oheight, owidth); + } +} + +void THNN_(SpatialMaxUnpooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int owidth, int oheight) +{ + int dimw = 2; + int dimh = 1; + int nbatch = 1; + int nslices; + int iheight; + int iwidth; + real *input_data; + real *output_data; + THIndex_t *indices_data; + + + THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input, + "3D or 4D (batch mode) tensor expected for input, but got: %s"); + THNN_CHECK_SHAPE_INDICES(input, indices); + + if (input->nDimension == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + /* sizes */ + nslices = input->size[dimh-1]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + + /* get contiguous input and indices */ + input = THTensor_(newContiguous)(input); + indices = THIndexTensor_(newContiguous)(indices); + + /* resize output */ + if (input->nDimension == 3) + { + THTensor_(resize3d)(output, nslices, oheight, owidth); + THTensor_(zero)(output); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + THNN_(SpatialMaxUnpooling_updateOutput_frame)(input_data, output_data, + indices_data, + nslices, + iwidth, iheight, + owidth, oheight); + } + else + { + int p; + + THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth); + THTensor_(zero)(output); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + for (p = 0; p < nbatch; p++) + { + THNN_(SpatialMaxUnpooling_updateOutput_frame)( + input_data+p*nslices*iwidth*iheight, + output_data+p*nslices*owidth*oheight, + indices_data+p*nslices*iwidth*iheight, + nslices, + iwidth, iheight, + owidth, oheight); + } + } + + /* cleanup */ + THTensor_(free)(input); + THIndexTensor_(free)(indices); +} + +static void THNN_(SpatialMaxUnpooling_updateGradInput_frame)(real *gradInput_p, real *gradOutput_p, + THIndex_t *ind_p, + int nslices, + int iwidth, int iheight, + int owidth, int oheight) +{ + int k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + real *gradInput_p_k = gradInput_p + k*iwidth*iheight; + real *gradOutput_p_k = gradOutput_p + k*owidth*oheight; + THIndex_t *ind_p_k = ind_p + k*iwidth*iheight; + + int i, j; + THIndex_t maxp; + for(i = 0; i < iheight; i++) + { + for(j = 0; j < iwidth; j++) + { + maxp = ind_p_k[i*iwidth + j] - TH_INDEX_BASE; /* retrieve position of max */ + if(maxp < 0 || maxp >= owidth * oheight) { + THError("invalid max index %ld, owidth= %d, oheight= %d", maxp, owidth, oheight); + } + gradInput_p_k[i*iwidth + j] = gradOutput_p_k[maxp]; /* update gradient */ + } + } + } +} + +void THNN_(SpatialMaxUnpooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int owidth, int oheight) +{ + int dimw = 2; + int dimh = 1; + int nbatch = 1; + int nslices; + int iheight; + int iwidth; + real *gradInput_data; + real *gradOutput_data; + 
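The backward frame above is the mirror image of the forward scatter: instead of writing each input value out to its recorded offset, every input position reads back the gradient parked at that offset. A standalone sketch under the same illustrative assumptions as before; here an invalid index is zeroed for brevity, whereas the frame function above raises THError:

    #include <stddef.h>

    /* hypothetical analogue of the per-slice backward gather; not part of the diff */
    static void unpool_slice_grad(float *gin, const long *ind, const float *gout,
                                  size_t isize, size_t osize)
    {
      for (size_t i = 0; i < isize; i++) {
        long maxp = ind[i];                  /* same offsets the forward pass consumed */
        gin[i] = (maxp >= 0 && (size_t)maxp < osize) ? gout[maxp] : 0.0f;
      }
    }
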
THIndex_t *indices_data; + + THNN_CHECK_SHAPE_INDICES(input, indices); + + /* get contiguous gradOutput and indices */ + gradOutput = THTensor_(newContiguous)(gradOutput); + indices = THIndexTensor_(newContiguous)(indices); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->nDimension == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + /* sizes */ + nslices = input->size[dimh-1]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + + if(owidth!=gradOutput->size[dimw] || oheight!=gradOutput->size[dimh]){ + THError("Inconsistent gradOutput size. oheight= %d, owidth= %d, gradOutput: %dx%d", + oheight, owidth, gradOutput->size[dimh], gradOutput->size[dimw]); + } + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = THIndexTensor_(data)(indices); + + /* backprop */ + if (input->nDimension == 3) + { + THNN_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data, gradOutput_data, + indices_data, + nslices, + iwidth, iheight, + owidth, oheight); + } + else + { + int p; + for (p = 0; p < nbatch; p++) + { + THNN_(SpatialMaxUnpooling_updateGradInput_frame)(gradInput_data+p*nslices*iwidth*iheight, gradOutput_data+p*nslices*owidth*oheight, + indices_data+p*nslices*iwidth*iheight, + nslices, + iwidth, iheight, + owidth, oheight); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); + THIndexTensor_(free)(indices); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialReflectionPadding.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialReflectionPadding.c new file mode 100644 index 000000000..dcde660ea --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialReflectionPadding.c @@ -0,0 +1,260 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialReflectionPadding.c" +#else + +static void THNN_(SpatialReflectionPadding_updateOutput_frame)( + real *input_p, real *output_p, + long nslices, + long iwidth, long iheight, + long owidth, long oheight, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int iStartX = fmax(0, -pad_l); + int iStartY = fmax(0, -pad_t); + int oStartX = fmax(0, pad_l); + int oStartY = fmax(0, pad_t); + + long k, ip_x, ip_y; +#pragma omp parallel for private(k, ip_x, ip_y) + + for (k = 0; k < nslices; k++) + { + long i, j; + for (i = 0; i < oheight; i++) { + for (j = 0; j < owidth; j++) { + if (j < pad_l) { + ip_x = pad_l * 2 - j; + } else if (j >= pad_l && j < iwidth + pad_l) { + ip_x = j; + } else { + ip_x = (iwidth + pad_l - 1) * 2 - j; + } + ip_x = ip_x - oStartX + iStartX; + + if (i < pad_t) { + ip_y = pad_t * 2 - i; + } else if (i >= pad_t && i < iheight + pad_t) { + ip_y = i; + } else { + ip_y = (iheight + pad_t - 1) * 2 - i; + } + ip_y = ip_y - oStartY + iStartY; + + real *dest_p = output_p + k*owidth*oheight + i * owidth + j; + real *src_p = input_p + k*iwidth*iheight + ip_y * iwidth + ip_x; + *dest_p = *src_p; + } + } + } +} + +void THNN_(SpatialReflectionPadding_updateOutput)(THNNState *state, + THTensor *input, + THTensor *output, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int dimw = 2; + int dimh = 1; + int dimslices = 0; + long nbatch = 1; + long nslices; + long iheight; + long iwidth; + long oheight; + long owidth; + real *input_data; + real *output_data; + + THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input, + "3D or 4D (batch mode) tensor expected for input, but got: %s"); + + if (input->nDimension == 4) + { + nbatch = input->size[0]; 
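The heart of SpatialReflectionPadding is the coordinate arithmetic in the frame function above: an output column j inside a pad band maps to the mirror image of j about the nearest input edge, and interior columns copy straight through (rows are handled identically). Below is a hypothetical 1-D rendering of that mapping, not part of the patch, assuming non-negative pads so that iStartX collapses to 0 and oStartX to the pad width; SpatialReplicationPadding, further down in this diff, reuses the same skeleton but clamps to the edge instead of mirroring:

    /* reflect: mirror about the first/last element, then shift to input coords */
    static long reflect_index(long j, long pad, long isize)
    {
      long ip;
      if (j < pad)
        ip = 2 * pad - j;                    /* left band */
      else if (j < isize + pad)
        ip = j;                              /* interior copy */
      else
        ip = 2 * (isize + pad - 1) - j;      /* right band */
      return ip - pad;
    }

    /* replicate: same shape, but effectively clamp(j - pad, 0, isize - 1) */
    static long replicate_index(long j, long pad, long isize)
    {
      long ip = (j < pad) ? pad : (j < isize + pad ? j : isize + pad - 1);
      return ip - pad;
    }

For isize = 4 and pad = 2, reflect_index maps output columns 0..7 to input columns 2,1,0,1,2,3,2,1, while replicate_index yields 0,0,0,1,2,3,3,3.
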
+ dimw++; + dimh++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + oheight = iheight + pad_t + pad_b; + owidth = iwidth + pad_l + pad_r; + + THArgCheck(owidth >= 1 || oheight >= 1 , 2, + "input (H: %d, W: %d)is too small." + " Calculated output H: %d W: %d", + iheight, iwidth, oheight, owidth); + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + /* resize output */ + if (input->nDimension == 3) + { + THTensor_(resize3d)(output, nslices, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + + THNN_(SpatialReflectionPadding_updateOutput_frame)(input_data, output_data, + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } + else + { + long p; + + THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(SpatialReflectionPadding_updateOutput_frame)( + input_data+p*nslices*iwidth*iheight, + output_data+p*nslices*owidth*oheight, + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(SpatialReflectionPadding_updateGradInput_frame)( + real *ginput_p, real *goutput_p, + long nslices, + long iwidth, long iheight, + long owidth, long oheight, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int iStartX = fmax(0, -pad_l); + int iStartY = fmax(0, -pad_t); + int oStartX = fmax(0, pad_l); + int oStartY = fmax(0, pad_t); + + long k, ip_x, ip_y; +#pragma omp parallel for private(k, ip_x, ip_y) + + for (k = 0; k < nslices; k++) + { + long i, j; + for (i = 0; i < oheight; i++) { + for (j = 0; j < owidth; j++) { + if (j < pad_l) { + ip_x = pad_l * 2 - j; + } else if (j >= pad_l && j < iwidth + pad_l) { + ip_x = j; + } else { + ip_x = (iwidth + pad_l - 1) * 2 - j; + } + ip_x = ip_x - oStartX + iStartX; + + if (i < pad_t) { + ip_y = pad_t * 2 - i; + } else if (i >= pad_t && i < iheight + pad_t) { + ip_y = i; + } else { + ip_y = (iheight + pad_t - 1) * 2 - i; + } + ip_y = ip_y - oStartY + iStartY; + + real *src_p = goutput_p + k*owidth*oheight + i * owidth + j; + real *dest_p = ginput_p + k*iwidth*iheight + ip_y * iwidth + ip_x; + *dest_p += *src_p; + } + } + } +} + +void THNN_(SpatialReflectionPadding_updateGradInput)(THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int dimw = 2; + int dimh = 1; + int dimslices = 0; + long nbatch = 1; + long nslices; + long iheight; + long iwidth; + long oheight; + long owidth; + + if (input->nDimension == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + oheight = iheight + pad_t + pad_b; + owidth = iwidth + pad_l + pad_r; + + THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3, + "gradOutput width unexpected. Expected: %d, Got: %d", + owidth, THTensor_(size)(gradOutput, dimw)); + THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3, + "gradOutput height unexpected. 
Expected: %d, Got: %d", + oheight, THTensor_(size)(gradOutput, dimh)); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* backprop */ + if (input->nDimension == 3) { + THNN_(SpatialReflectionPadding_updateGradInput_frame)( + THTensor_(data)(gradInput), + THTensor_(data)(gradOutput), + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } else { + long p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) { + THNN_(SpatialReflectionPadding_updateGradInput_frame)( + THTensor_(data)(gradInput) + p * nslices * iheight * iwidth, + THTensor_(data)(gradOutput) + p * nslices * oheight * owidth, + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialReplicationPadding.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialReplicationPadding.c new file mode 100644 index 000000000..4e318aa70 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialReplicationPadding.c @@ -0,0 +1,260 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialReplicationPadding.c" +#else + +static void THNN_(SpatialReplicationPadding_updateOutput_frame)( + real *input_p, real *output_p, + long nslices, + long iwidth, long iheight, + long owidth, long oheight, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int iStartX = fmax(0, -pad_l); + int iStartY = fmax(0, -pad_t); + int oStartX = fmax(0, pad_l); + int oStartY = fmax(0, pad_t); + + long k, ip_x, ip_y; +#pragma omp parallel for private(k, ip_x, ip_y) + for (k = 0; k < nslices; k++) + { + long i, j; + for (i = 0; i < oheight; i++) { + for (j = 0; j < owidth; j++) { + if (j < pad_l) { + ip_x = pad_l; + } else if (j >= pad_l && j < iwidth + pad_l) { + ip_x = j; + } else { + ip_x = iwidth + pad_l - 1; + } + ip_x = ip_x - oStartX + iStartX; + + if (i < pad_t) { + ip_y = pad_t; + } else if (i >= pad_t && i < iheight + pad_t) { + ip_y = i; + } else { + ip_y = iheight + pad_t - 1; + } + ip_y = ip_y - oStartY + iStartY; + + real *dest_p = output_p + k*owidth*oheight + i * owidth + j; + real *src_p = input_p + k*iwidth*iheight + ip_y * iwidth + ip_x; + *dest_p = *src_p; + } + } + } +} + +void THNN_(SpatialReplicationPadding_updateOutput)(THNNState *state, + THTensor *input, + THTensor *output, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int dimw = 2; + int dimh = 1; + int dimslices = 0; + long nbatch = 1; + long nslices; + long iheight; + long iwidth; + long oheight; + long owidth; + real *input_data; + real *output_data; + + THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input, + "3D or 4D (batch mode) tensor expected for input, but got: %s"); + + if (input->nDimension == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + oheight = iheight + pad_t + pad_b; + owidth = iwidth + pad_l + pad_r; + + THArgCheck(owidth >= 1 || oheight >= 1 , 2, + "input (H: %d, W: %d)is too small." 
+ " Calculated output H: %d W: %d", + iheight, iwidth, oheight, owidth); + + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + /* resize output */ + if (input->nDimension == 3) + { + THTensor_(resize3d)(output, nslices, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + + THNN_(SpatialReplicationPadding_updateOutput_frame)(input_data, output_data, + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } + else + { + long p; + + THTensor_(resize4d)(output, nbatch, nslices, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(SpatialReplicationPadding_updateOutput_frame)( + input_data+p*nslices*iwidth*iheight, + output_data+p*nslices*owidth*oheight, + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(SpatialReplicationPadding_updateGradInput_frame)( + real *ginput_p, real *goutput_p, + long nslices, + long iwidth, long iheight, + long owidth, long oheight, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int iStartX = fmax(0, -pad_l); + int iStartY = fmax(0, -pad_t); + int oStartX = fmax(0, pad_l); + int oStartY = fmax(0, pad_t); + + long k, ip_x, ip_y; +#pragma omp parallel for private(k, ip_x, ip_y) + for (k = 0; k < nslices; k++) + { + long i, j; + for (i = 0; i < oheight; i++) { + for (j = 0; j < owidth; j++) { + if (j < pad_l) { + ip_x = pad_l; + } else if (j >= pad_l && j < iwidth + pad_l) { + ip_x = j; + } else { + ip_x = iwidth + pad_l - 1; + } + ip_x = ip_x - oStartX + iStartX; + + if (i < pad_t) { + ip_y = pad_t; + } else if (i >= pad_t && i < iheight + pad_t) { + ip_y = i; + } else { + ip_y = iheight + pad_t - 1; + } + ip_y = ip_y - oStartY + iStartY; + + real *src_p = goutput_p + k*owidth*oheight + i * owidth + j; + real *dest_p = ginput_p + k*iwidth*iheight + ip_y * iwidth + ip_x; + *dest_p += *src_p; + } + } + } +} + +void THNN_(SpatialReplicationPadding_updateGradInput)(THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_l, int pad_r, + int pad_t, int pad_b) +{ + int dimw = 2; + int dimh = 1; + int dimslices = 0; + long nbatch = 1; + long nslices; + long iheight; + long iwidth; + long oheight; + long owidth; + + if (input->nDimension == 4) + { + nbatch = input->size[0]; + dimw++; + dimh++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + oheight = iheight + pad_t + pad_b; + owidth = iwidth + pad_l + pad_r; + + THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3, + "gradOutput width unexpected. Expected: %d, Got: %d", + owidth, THTensor_(size)(gradOutput, dimw)); + THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3, + "gradOutput height unexpected. 
Expected: %d, Got: %d", + oheight, THTensor_(size)(gradOutput, dimh)); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* backprop */ + if (input->nDimension == 3) { + THNN_(SpatialReplicationPadding_updateGradInput_frame)( + THTensor_(data)(gradInput), + THTensor_(data)(gradOutput), + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } else { + long p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) { + THNN_(SpatialReplicationPadding_updateGradInput_frame)( + THTensor_(data)(gradInput) + p * nslices * iheight * iwidth, + THTensor_(data)(gradOutput) + p * nslices * oheight * owidth, + nslices, + iwidth, iheight, + owidth, oheight, + pad_l, pad_r, + pad_t, pad_b); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialSubSampling.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialSubSampling.c new file mode 100644 index 000000000..4c077bc64 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialSubSampling.c @@ -0,0 +1,302 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialSubSampling.c" +#else + +static inline void THNN_(SpatialSubSampling_shapeCheck)( + THTensor *input, + THTensor *gradOutput, + THTensor *weight, + int kW, int kH) { + int ndims = input->nDimension; + THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input, + "3D or 4D input tensor expected but got: %s"); + THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous"); + + int nInputPlane = THTensor_(size)(weight, 0); + + int dimw = 2; + int dimh = 1; + + long inputWidth; + long inputHeight; + + if (input->nDimension == 4) { + dimw++; + dimh++; + } + + inputWidth = input->size[dimw]; + inputHeight = input->size[dimh]; + + THArgCheck(input->size[dimh-1] == nInputPlane, 2, "invalid number of input planes"); + THArgCheck(inputWidth >= kW && inputHeight >= kH, 2, "input image smaller than kernel size"); +} + +void THNN_(SpatialSubSampling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, int kH, + int dW, int dH) +{ + THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous"); + + real *weight_data = THTensor_(data)(weight); + real *bias_data = THTensor_(data)(bias); + real *output_data; + real *input_data; + + int dimw = 2; + int dimh = 1; + long nbatch = 1; + + long inputWidth; + long inputHeight; + long outputWidth; + long outputHeight; + + int nInputPlane = THTensor_(size)(weight,0); + + long k; + + THNN_(SpatialSubSampling_shapeCheck)(input, NULL, weight, kW, kH); + + if (input->nDimension == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + inputWidth = input->size[dimw]; + inputHeight = input->size[dimh]; + outputWidth = (inputWidth - kW) / dW + 1; + outputHeight = (inputHeight - kH) / dH + 1; + + if (input->nDimension == 3) + THTensor_(resize3d)(output, nInputPlane, outputHeight, outputWidth); + else + THTensor_(resize4d)(output, input->size[0], nInputPlane, outputHeight, outputWidth); + + input = THTensor_(newContiguous)(input); + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane; k++) + { + long p; + for(p = 0; p < nbatch; p++) + { + long xx, yy; + /* For all output pixels... 
*/ + real *ptr_output = output_data + p*nInputPlane*outputWidth*outputHeight + k*outputWidth*outputHeight; + /* Get the good mask for (k,i) (k out, i in) */ + real the_weight = weight_data[k]; + /* Initialize to the bias */ + real z = bias_data[k]; + long i; + for(i = 0; i < outputWidth*outputHeight; i++) + ptr_output[i] = z; + + for(yy = 0; yy < outputHeight; yy++) + { + for(xx = 0; xx < outputWidth; xx++) + { + /* Compute the mean of the input image... */ + real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW; + real sum = 0; + long kx, ky; + + for(ky = 0; ky < kH; ky++) + { + for(kx = 0; kx < kW; kx++) + sum += ptr_input[kx]; + ptr_input += inputWidth; /* next input line */ + } + /* Update output */ + *ptr_output++ += the_weight*sum; + } + } + } + } + THTensor_(free)(input); +} + +void THNN_(SpatialSubSampling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, int kH, + int dW, int dH) +{ + THNN_(SpatialSubSampling_shapeCheck)(input, gradOutput, weight, kW, kH); + + int dimw = 2; + int dimh = 1; + long nbatch = 1; + + long inputWidth; + long inputHeight; + long outputWidth; + long outputHeight; + + int nInputPlane = THTensor_(size)(weight,0); + + real *weight_data; + real *gradOutput_data; + real *input_data, *gradInput_data; + + long k; + + if (input->nDimension == 4) { + nbatch = input->size[0]; + dimw++; + dimh++; + } + + inputWidth = input->size[dimw]; + inputHeight = input->size[dimh]; + outputWidth = (inputWidth - kW) / dW + 1; + outputHeight = (inputHeight - kH) / dH + 1; + + weight_data = THTensor_(data)(weight); + gradOutput = THTensor_(newContiguous)(gradOutput); + gradOutput_data = THTensor_(data)(gradOutput); + + input_data = THTensor_(data)(input); + + THTensor_(resizeAs)(gradInput, input); + gradInput_data = THTensor_(data)(gradInput); + +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane; k++) + { + long p; + for(p = 0; p < nbatch; p++) + { + real the_weight = weight_data[k]; + real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight; + long xx, yy; + + real* ptr_gi = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight; + long i; + for(i=0; i<inputWidth*inputHeight; i++) + ptr_gi[i] = 0.0; + + for(yy = 0; yy < outputHeight; yy++) + { + for(xx = 0; xx < outputWidth; xx++) + { + real *ptr_gradInput = gradInput_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW; + real z = *ptr_gradOutput++ * the_weight; + long kx, ky; + + for(ky = 0; ky < kH; ky++) + { + for(kx = 0; kx < kW; kx++) + ptr_gradInput[kx] += z; + ptr_gradInput += inputWidth; + } + } + } + } + } + THTensor_(free)(gradOutput); +} + +void THNN_(SpatialSubSampling_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, int kH, + int dW, int dH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + THNN_(SpatialSubSampling_shapeCheck)(input, gradOutput, gradWeight, kW, kH); + + long nbatch = 1; + long dimw = 2; + long dimh = 1; + + long inputWidth; + long inputHeight; + long outputWidth; + long outputHeight; + + int nInputPlane = THTensor_(size)(gradWeight,0); + + real *gradWeight_data; + real *gradBias_data; + real *gradOutput_data; + real *input_data; + + long k; + + if (input->nDimension == 4) { + dimw++; + dimh++; + nbatch = 
input->size[0]; + } + + inputWidth = input->size[dimw]; + inputHeight = input->size[dimh]; + outputWidth = (inputWidth - kW) / dW + 1; + outputHeight = (inputHeight - kH) / dH + 1; + + gradWeight_data = THTensor_(data)(gradWeight); + gradBias_data = THTensor_(data)(gradBias); + gradOutput = THTensor_(newContiguous)(gradOutput); + gradOutput_data = THTensor_(data)(gradOutput); + + input = THTensor_(newContiguous)(input); + input_data = THTensor_(data)(input); + +#pragma omp parallel for private(k) + for(k = 0; k < nInputPlane; k++) + { + long p; + for(p = 0; p < nbatch; p++) + { + real *ptr_gradOutput = gradOutput_data + p*nInputPlane*outputHeight*outputWidth + k*outputWidth*outputHeight; + real sum; + long xx, yy; + long i; + + sum = 0; + for(i = 0; i < outputWidth*outputHeight; i++) + sum += ptr_gradOutput[i]; + gradBias_data[k] += scale*sum; + + sum = 0; + for(yy = 0; yy < outputHeight; yy++) + { + for(xx = 0; xx < outputWidth; xx++) + { + real *ptr_input = input_data + p*nInputPlane*inputWidth*inputHeight + k*inputWidth*inputHeight + yy*dH*inputWidth+xx*dW; + real z = *ptr_gradOutput++; + long kx, ky; + + for(ky = 0; ky < kH; ky++) + { + for(kx = 0; kx < kW; kx++) + sum += z * ptr_input[kx]; + ptr_input += inputWidth; + } + } + } + gradWeight_data[k] += scale*sum; + } + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingBilinear.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingBilinear.c new file mode 100644 index 000000000..8bc487ead --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingBilinear.c @@ -0,0 +1,174 @@ +// Adapted from interp.cpp from Caffe util by Pauline Luc +// Originally developed by George Papandreou + +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialUpSamplingBilinear.c" +#else + +static inline void THNN_(SpatialUpSamplingBilinear_shapeCheck) + (THTensor *input, THTensor *gradOutput, + int nBatch, int nChannels, + int inputHeight, int inputWidth, + int outputHeight, int outputWidth) { + THArgCheck(inputHeight > 0 && inputWidth > 0 + && outputHeight > 0 && outputWidth > 0, 2, + "input and output sizes should be greater than 0," + " but got input (H: %d, W: %d) output (H: %d, W: %d)", + inputHeight, inputWidth, outputHeight, outputWidth); + if (input != NULL) { + THNN_ARGCHECK(input->nDimension == 4, 2, input, + "4D input tensor expected but got: %s"); + } + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nBatch); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, nChannels); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, outputWidth); + } +} + +void THNN_(SpatialUpSamplingBilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputHeight, + int outputWidth){ + + int nbatch = THTensor_(size)(input, 0); + int channels = THTensor_(size)(input, 1); + int inputHeight = THTensor_(size)(input, 2); + int inputWidth = THTensor_(size)(input, 3); + + THNN_(SpatialUpSamplingBilinear_shapeCheck) + (input, NULL, + nbatch, channels, + inputHeight, inputWidth, + outputHeight, outputWidth); + + input = THTensor_(newContiguous)(input); + THTensor_(resize4d)(output, + THTensor_(size)(input, 0), + THTensor_(size)(input, 1), + outputHeight, outputWidth); + THTensor_(zero)(output); + real *idata = THTensor_(data)(input); + real *odata = THTensor_(data)(output); + channels = nbatch * channels; + THAssert(inputHeight > 0 && inputWidth > 0 && outputHeight > 0 && 
outputWidth > 0); + // special case: just copy + if (inputHeight == outputHeight && inputWidth == outputWidth) { + for (int h2 = 0; h2 < outputHeight; ++h2) { + const int h1 = h2; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = w2; + const real* pos1 = &idata[h1 * inputWidth + w1]; + real* pos2 = &odata[h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = pos1[0]; + pos1 += inputWidth * inputHeight; + pos2 += outputWidth * outputHeight; + } + } + } + return; + } + const float rheight =(outputHeight > 1) ? (float)(inputHeight - 1)/(outputHeight - 1) : 0.f; + const float rwidth = (outputWidth > 1) ? (float)(inputWidth - 1) / (outputWidth - 1) : 0.f; + for (int h2 = 0; h2 < outputHeight; ++h2) { + const float h1r = rheight * h2; + const int h1 = h1r; + const int h1p = (h1 < inputHeight - 1) ? 1 : 0; + const real h1lambda = h1r - h1; + const real h0lambda = (real)1. - h1lambda; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const float w1r = rwidth * w2; + const int w1 = w1r; + const int w1p = (w1 < inputWidth - 1) ? 1 : 0; + const real w1lambda = w1r - w1; + const real w0lambda = (real)1. - w1lambda; + const real* pos1 = &idata[h1 * inputWidth + w1]; + real* pos2 = &odata[h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos2[0] = h0lambda * (w0lambda * pos1[0]+ w1lambda * pos1[w1p]) + + h1lambda * (w0lambda * pos1[h1p * inputWidth] + + w1lambda * pos1[h1p * inputWidth + w1p]); + pos1 += inputWidth * inputHeight; + pos2 += outputWidth * outputHeight; + } + } + } + THTensor_(free)(input); +} + +void THNN_(SpatialUpSamplingBilinear_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int nbatch, + int channels, + int inputHeight, + int inputWidth, + int outputHeight, + int outputWidth){ + + THNN_(SpatialUpSamplingBilinear_shapeCheck) + (NULL, gradOutput, + nbatch, channels, + inputHeight, inputWidth, + outputHeight, outputWidth); + + THTensor_(resize4d)(gradInput, nbatch, channels, inputHeight, inputWidth); + THTensor_(zero)(gradInput); + gradOutput = THTensor_(newContiguous)(gradOutput); + real *data1 = THTensor_(data)(gradInput); + real *data2 = THTensor_(data)(gradOutput); + channels = nbatch * channels; + + // special case: same-size matching grids + if (inputHeight == outputHeight && inputWidth == outputWidth) { + for (int h2 = 0; h2 < outputHeight; ++h2) { + const int h1 = h2; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const int w1 = w2; + real* pos1 = &data1[h1 * inputWidth + w1]; + const real* pos2 = &data2[h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += pos2[0]; + pos1 += inputWidth * inputHeight; + pos2 += outputWidth * outputHeight; + } + } + } + return; + } + const float rheight =(outputHeight > 1) ? (float)(inputHeight - 1)/(outputHeight - 1) : 0.f; + const float rwidth = (outputWidth > 1) ? (float)(inputWidth - 1)/(outputWidth - 1) : 0.f; + for (int h2 = 0; h2 < outputHeight; ++h2) { + const float h1r = rheight * h2; + const int h1 = h1r; + const int h1p = (h1 < inputHeight - 1) ? 1 : 0; + const real h1lambda = h1r - h1; + const real h0lambda = (real)1. - h1lambda; + for (int w2 = 0; w2 < outputWidth; ++w2) { + const float w1r = rwidth * w2; + const int w1 = w1r; + const int w1p = (w1 < inputWidth - 1) ? 1 : 0; + const real w1lambda = w1r - w1; + const real w0lambda = (real)1. 
- w1lambda; + real* pos1 = &data1[h1 * inputWidth + w1]; + const real* pos2 = &data2[h2 * outputWidth + w2]; + for (int c = 0; c < channels; ++c) { + pos1[0] += h0lambda * w0lambda * pos2[0]; + pos1[w1p] += h0lambda * w1lambda * pos2[0]; + pos1[h1p * inputWidth] += h1lambda * w0lambda * pos2[0]; + pos1[h1p * inputWidth + w1p] += h1lambda * w1lambda * pos2[0]; + pos1 += inputWidth * inputHeight; + pos2 += outputWidth * outputHeight; + } + } + } + THTensor_(free)(gradOutput); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingNearest.c b/contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingNearest.c new file mode 100644 index 000000000..b4699ff3e --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/SpatialUpSamplingNearest.c @@ -0,0 +1,199 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/SpatialUpSamplingNearest.c" +#else + + +static inline void THNN_(SpatialUpSamplingNearest_shapeCheck) + (THTensor *input, THTensor *gradOutput, + int scale_factor) { + THArgCheck(input != NULL, 2, "4D input tensor expected but got NULL"); + THArgCheck(scale_factor > 1, 4, + "scale_factor must be greater than 1, but got: %d", scale_factor); + THNN_ARGCHECK(input->nDimension == 3 || input->nDimension == 4, 2, input, + "3D or 4D input tensor expected but got: %s"); + if (input->nDimension == 3) { + int nChannels = THTensor_(size)(input, 0); + int inputHeight = THTensor_(size)(input, 1); + int inputWidth = THTensor_(size)(input, 2); + int outputHeight = inputHeight * scale_factor; + int outputWidth = inputWidth * scale_factor; + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 3, 0, nChannels); + THNN_CHECK_DIM_SIZE(gradOutput, 3, 1, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, 3, 2, outputWidth); + } + } else { + int nBatch = THTensor_(size)(input, 0); + int nChannels = THTensor_(size)(input, 1); + int inputHeight = THTensor_(size)(input, 2); + int inputWidth = THTensor_(size)(input, 3); + int outputHeight = inputHeight * scale_factor; + int outputWidth = inputWidth * scale_factor; + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nBatch); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, nChannels); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, outputWidth); + } + } +} + +void THNN_(SpatialUpSamplingNearest_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int scale_factor) +{ + THNN_(SpatialUpSamplingNearest_shapeCheck)(input, NULL, scale_factor); + int inputHeight = THTensor_(size)(input, input->nDimension-2); + int inputWidth = THTensor_(size)(input, input->nDimension-1); + int outputHeight = inputHeight * scale_factor; + int outputWidth = inputWidth * scale_factor; + + if (input->nDimension == 3) { + THTensor_(resize3d)(output, + THTensor_(size)(input, 0), + outputHeight, outputWidth); + } else { + THTensor_(resize4d)(output, + THTensor_(size)(input, 0), + THTensor_(size)(input, 1), + outputHeight, outputWidth); + } + + int dW = scale_factor; + int dH = scale_factor; + int xDim = input->nDimension-2; + int yDim = input->nDimension-1; + + // dims + int idim = input->nDimension; + int osz0 = output->size[0]; + int osz1 = output->size[1]; + int osz2 = output->size[2]; + int osz3 = 1; + if (idim > 3) { + osz3 = output->size[3]; + } + + // get strides + long *is = input->stride; + long *os = output->stride; + + // get raw pointers + real *pin = THTensor_(data)(input); + real *pout = THTensor_(data)(output); + + // perform the upsampling + int i0, i1, i2, i3, isrc, 
idst; + int iout[4]; // Output indices + int iin[4]; // Input indices + + for (i0 = 0; i0 < osz0; i0++) { + iout[0] = i0; + iin[0] = i0; + for (i1 = 0; i1 < osz1; i1++) { + iout[1] = i1; + iin[1] = i1; + for (i2 = 0; i2 < osz2; i2++) { + iout[2] = i2; + iin[2] = i2; + for (i3 = 0; i3 < osz3; i3++) { + iout[3] = i3; + iin[3] = i3; + + // set the indices for the upsampled dimensions + iin[xDim] = iout[xDim] / dW; + iin[yDim] = iout[yDim] / dH; + + idst = i0*os[0] + i1*os[1] + i2*os[2]; + isrc = iin[0]*is[0] + iin[1]*is[1] + iin[2]*is[2]; + if (idim > 3) { + idst += i3*os[3]; + isrc += iin[3]*is[3]; + } + + pout[idst] = pin[isrc]; + } + } + } + } +} + +void THNN_(SpatialUpSamplingNearest_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int scale_factor) +{ + THNN_(SpatialUpSamplingNearest_shapeCheck)(input, gradOutput, scale_factor); + THTensor_(resizeAs)(gradInput, input); + + int dW = scale_factor; + int dH = scale_factor; + int xDim = gradInput->nDimension-2; + int yDim = gradInput->nDimension-1; + + // dims + int idim = gradInput->nDimension; // Guaranteed to be between 3 and 5 + int isz0 = gradInput->size[0]; + int isz1 = gradInput->size[1]; + int isz2 = gradInput->size[2]; + int isz3 = 1; + if (idim > 3) { + isz3 = gradInput->size[3]; + } + + // get strides + long *is = gradInput->stride; + long *os = gradOutput->stride; + + // get raw pointers + real *pin = THTensor_(data)(gradInput); + real *pout = THTensor_(data)(gradOutput); + + // perform the upsampling + int i0, i1, i2, i3, isrc, idst, x, y; + int iin[4]; // Input indices + int iout[4]; // Output indices + + THTensor_(zero)(gradInput); + + for (i0 = 0; i0 < isz0; i0++) { + iin[0] = i0; + iout[0] = i0; + for (i1 = 0; i1 < isz1; i1++) { + iin[1] = i1; + iout[1] = i1; + for (i2 = 0; i2 < isz2; i2++) { + iin[2] = i2; + iout[2] = i2; + for (i3 = 0; i3 < isz3; i3++) { + iin[3] = i3; + iout[3] = i3; + + idst = i0*is[0] + i1*is[1] + i2*is[2]; + if (idim > 3) { + idst += i3*is[3]; + } + + // Now accumulate the gradients from gradOutput + for (y = 0; y < dH; y++) { + for (x = 0; x < dW; x++) { + iout[xDim] = dW * iin[xDim] + x; + iout[yDim] = dH * iin[yDim] + y; + isrc = iout[0]*os[0] + iout[1]*os[1] + iout[2]*os[2]; + if (idim > 3) { + isrc += iout[3]*os[3]; + } + pin[idst] += pout[isrc]; + } + } + } + } + } + } +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Sqrt.c b/contrib/lua-torch/nn/lib/THNN/generic/Sqrt.c new file mode 100644 index 000000000..174884e34 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/Sqrt.c @@ -0,0 +1,52 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Sqrt.c" +#else + +void THNN_(Sqrt_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal eps_) +{ + real eps = TH_CONVERT_ACCREAL_TO_REAL(eps_); + THTensor_(resizeAs)(output, input); + THTensor_(sqrt)(output, input); +} + +void THNN_(Sqrt_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output) +{ + THNN_CHECK_SHAPE(output, gradOutput); + THTensor_(resizeAs)(gradInput, input); + + if (output->nDimension == 1 || + !THTensor_(isContiguous)(output) || + !THTensor_(isContiguous)(gradOutput) || + !THTensor_(isContiguous)(gradInput)) + { + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, output, + *gradInput_data = (*output_data == 0.0) ? 
0.0 : (0.5 * (*gradOutput_data / *output_data)); + ); + } + else + { + real *gradOutput_data = THTensor_(data)(gradOutput); + real *gradInput_data = THTensor_(data)(gradInput); + real *output_data = THTensor_(data)(output); + long i; +#pragma omp parallel for private(i) + for(i = 0; i < THTensor_(nElement)(output); i++) + { + if (output_data[i] == 0.0) + gradInput_data[i] = 0.0; + else + gradInput_data[i] = 0.5 * (gradOutput_data[i] / output_data[i]); + } + } +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Square.c b/contrib/lua-torch/nn/lib/THNN/generic/Square.c new file mode 100644 index 000000000..aad0a911c --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/Square.c @@ -0,0 +1,59 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Square.c" +#else + +void THNN_(Square_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output) +{ + THTensor_(resizeAs)(output, input); + + if (input->nDimension == 1 || !THTensor_(isContiguous)(input) || !THTensor_(isContiguous)(output)) + { + TH_TENSOR_APPLY2(real, output, real, input, + *output_data = (*input_data) * (*input_data); + ); + } + else + { + real *output_data = THTensor_(data)(output); + real *input_data = THTensor_(data)(input); + long i; +#pragma omp parallel for private(i) + for (i = 0; i < THTensor_(nElement)(input); i++) + output_data[i] = input_data[i]*input_data[i]; + } +} + +void THNN_(Square_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput) +{ + THNN_CHECK_SHAPE(input, gradOutput); + THTensor_(resizeAs)(gradInput, input); + + if (input->nDimension == 1 || + !THTensor_(isContiguous)(input) || + !THTensor_(isContiguous)(gradOutput) || + !THTensor_(isContiguous)(gradInput)) + { + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + *gradInput_data = 2.0 * (*gradOutput_data) * (*input_data); + ); + } + else + { + real *gradOutput_data = THTensor_(data)(gradOutput); + real *gradInput_data = THTensor_(data)(gradInput); + real *input_data = THTensor_(data)(input); + long i; +#pragma omp parallel for private(i) + for (i = 0; i < THTensor_(nElement)(gradInput); i++) + gradInput_data[i] = 2.0 * gradOutput_data[i] * input_data[i]; + } +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/THNN.h b/contrib/lua-torch/nn/lib/THNN/generic/THNN.h new file mode 100644 index 000000000..76a28eb2d --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/THNN.h @@ -0,0 +1,1501 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/THNN.h" +#else + +TH_API void THNN_(Abs_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output); // [OUT] Abs output +TH_API void THNN_(Abs_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. output + THTensor *gradInput); // [OUT] gradient w.r.t. input + +TH_API void THNN_(AbsCriterion_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *target, // tensor with target values + THTensor *output, // [OUT] a one-element tensor with loss + bool sizeAverage); // if true, the loss will be divided by batch size +TH_API void THNN_(AbsCriterion_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *target, // tensor with target values + THTensor *gradInput, // [OUT] gradient w.r.t. 
input + bool sizeAverage); // if true, the gradient will be normalized by batch size + +TH_API void THNN_(BCECriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage, + THTensor *weights); // [OPTIONAL] +TH_API void THNN_(BCECriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage, + THTensor *weights); // [OPTIONAL] + +TH_API void THNN_(ClassNLLCriterion_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor (1D/2D) + THIndexTensor *target, // tensor containing indexes of target classes + THTensor *output, // [OUT] a one-element tensor with loss + bool sizeAverage, // if true, the loss will be normalized by batch size and class weights + THTensor *weights, // [OPTIONAL] class weights + THTensor *total_weight, // [BUFFER] + long ignore_index); // target index to ignore (loss = 0, gradInput = 0) +TH_API void THNN_(ClassNLLCriterion_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor (1D/2D) + THIndexTensor *target, // tensor containing indexes of target classes + THTensor *gradInput, // [OUT] gradient w.r.t. input + bool sizeAverage, // if true, the loss will be normalized by batch size and class weights + THTensor *weights, // [OPTIONAL] class weights + THTensor *total_weight, // [BUFFER] + long ignore_index); // target index to ignore (loss = 0, gradInput = 0) + +TH_API void THNN_(SpatialClassNLLCriterion_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor (4D) + THIndexTensor *target, // tensor containing indexes of target classes (3D) + THTensor *output, // [OUT] a one-element tensor with loss + bool sizeAverage, // if true, the loss will be normalized by batch size and class weights + THTensor *weights, // [OPTIONAL] class weights + THTensor *total_weight); // [BUFFER] +TH_API void THNN_(SpatialClassNLLCriterion_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor (4D) + THIndexTensor *target, // tensor containing indexes of target classes (3D) + THTensor *gradInput, // [OUT] gradient w.r.t. input + bool sizeAverage, // if true, the loss will be normalized by batch size and class weights + THTensor *weights, // [OPTIONAL] class weights + THTensor *total_weight); // [BUFFER] + +TH_API void THNN_(ELU_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output, // [OUT] ELU output + accreal alpha, // an ELU parameter (as in paper) + bool inplace); // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated) +TH_API void THNN_(ELU_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. output + THTensor *gradInput, // [OUT] gradient w.r.t. 
input + THTensor *output, // output from a forward pass + accreal alpha, // an ELU parameter (as in paper) + bool inplace); // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated) + +TH_API void THNN_(DistKLDivCriterion_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *target, // target tensor + THTensor *output, // [OUT] a one-element tensor containing the loss + bool sizeAverage); // if true, the loss will be normalized **by total number of elements** +TH_API void THNN_(DistKLDivCriterion_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *target, // target tensor + THTensor *gradInput, // [OUT] gradient w.r.t. input + bool sizeAverage); // if true, the loss will be normalized **by total number of elements** + +TH_API void THNN_(GatedLinear_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output, // [OUT] output tensor, half size of input along dimension dim + int dim); // dimension for halving operation +TH_API void THNN_(GatedLinear_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t module's output + THTensor *gradInput, // [OUT] gradient w.r.t input + int dim); // dimension for halving operation + +// HardShink outputs 0 on interval of (-lambda; lambda) or original value otherwise. +TH_API void THNN_(HardShrink_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output, // [OUT] output tensor + accreal lambda); // HardShrink parameter +TH_API void THNN_(HardShrink_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. module's output + THTensor *gradInput, // [OUT] gradient w.r.t. input + accreal lambda); // HardShrink parameter + +// HardTanh clamps the values to the interval [min_val; max_val]. +TH_API void THNN_(HardTanh_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output, // [OUT] output tensor + accreal min_val, // lower threshold + accreal max_val, // upper threshold + bool inplace); +TH_API void THNN_(HardTanh_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. module's output + THTensor *gradInput, // [OUT] gradient w.r.t. the input + accreal min_val, // lower threshold + accreal max_val, // upper threshold + bool inplace); + +TH_API void THNN_(L1Cost_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output); // [OUT] output tensor +TH_API void THNN_(L1Cost_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // [OPTIONAL] gradient w.r.t module's output + THTensor *gradInput); // [OUT] gradient w.r.t the input + +TH_API void THNN_(LeakyReLU_updateOutput)( + THNNState *state, // library's state + THTensor *input, // [MODIFIED] input tensor + THTensor *output, // [OUT] output tensor + accreal negval, // negative part slope + bool inplace); // if true, modifies the input tensor and sets the output tensor on it (no additional memory is allocated) +TH_API void THNN_(LeakyReLU_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // [MODIFIED] gradient w.r.t. 
module's output + THTensor *gradInput, // [OUT] gradient w.r.t. the input + accreal negval, // negative part slope + bool inplace); // if true, modifies gradOutput and sets gradInput onto it (no additional memory is allocated) + +TH_API void THNN_(GRUFused_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *hidden, + THTensor *bias1, // [OPTIONAL] + THTensor *bias2, // [OPTIONAL] + THTensor *hx, + THTensor *output, + THTensor *storage); +TH_API void THNN_(GRUFused_updateGradInput)( + THNNState *state, + THTensor *gradInInput, + THTensor *gradInHidden, + THTensor *gradOutput, + THTensor *gradInputHx, + THTensor *storage); + +TH_API void THNN_(LSTMFused_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *hidden, + THTensor *bias1, // [OPTIONAL] + THTensor *bias2, // [OPTIONAL] + THTensor *cell, + THTensor *output, + THTensor *outputCell); +TH_API void THNN_(LSTMFused_updateGradInput)( + THNNState *state, + THTensor *storage, + THTensor *gradInGates, + THTensor *cx, + THTensor *cy, + THTensor *gradOutput, + THTensor *gradOutputCell, + THTensor *gradInputCx); + +TH_API void THNN_(LogSigmoid_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output, // output tensor + THTensor *buffer); // [BUFFER] +TH_API void THNN_(LogSigmoid_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input + THTensor *gradOutput, // gradient w.r.t. module's output + THTensor *gradInput, // [OUT] gradient w.r.t. input + THTensor *buffer); // [BUFFER] + +TH_API void THNN_(LogSoftMax_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *output); // [OUT] output tensor +TH_API void THNN_(LogSoftMax_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. module's output + THTensor *gradInput, // [OUT] gradient w.r.t. input + THTensor *output); // module's output + +TH_API void THNN_(LookupTable_accGradParameters)( + THNNState *state, + THIndexTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THIntegerTensor *count, + THTensor *sorted, // [OPTIONAL] + THIndexTensor *indices, // [OPTIONAL] + bool scaleGradByFreq, + int paddingValue, + accreal scale); + +TH_API void THNN_(LookupTable_renorm)( + THNNState *state, // library's state + THIndexTensor *idx, // vector containing row indices (modified in function) + THTensor *weight, // 2D tensor whose rows will be renormalized + accreal maxNorm, // maximum norm + accreal normType); // the norm type (e.g., normType=2, then it's 2-norm) + +TH_API void THNN_(MarginCriterion_updateOutput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *target, // target tensor (should contain only 1s and -1s) + THTensor *output, // [OUT] a one-element tensor containing the loss + bool sizeAverage, // if true, the loss is normalized by **total number of elements** + accreal margin); // a margin that is required for the loss to be 0 + +TH_API void THNN_(MarginCriterion_updateGradInput)( + THNNState *state, // library's state + THTensor *input, // input tensor + THTensor *target, // target tensor (should contin only 1s and -1s) + THTensor *gradInput, // [OUT] gradient w.r.t. 
module's input + bool sizeAverage, // if true, the gradient is normalized by **total number of elements** + accreal margin); // a margin that is required for the loss to be 0 + +TH_API void THNN_(SoftMarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage); + +TH_API void THNN_(SoftMarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage); + +TH_API void THNN_(MSECriterion_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage); +TH_API void THNN_(MSECriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage); + +TH_API void THNN_(MultiLabelMarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *output, + THTensor *isTarget, + bool sizeAverage); +TH_API void THNN_(MultiLabelMarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *gradInput, + THTensor *isTarget, + bool sizeAverage); + +TH_API void THNN_(MultiMarginCriterion_updateOutput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *output, + bool sizeAverage, + int p, + THTensor* weights, // [OPTIONAL] + accreal margin); +TH_API void THNN_(MultiMarginCriterion_updateGradInput)( + THNNState *state, + THTensor *input, + THIndexTensor *target, + THTensor *gradInput, + bool sizeAverage, + int p, + THTensor *weights, // [OPTIONAL] + accreal margin); + +TH_API void THNN_(PReLU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THIndex_t nOutputPlane); +TH_API void THNN_(PReLU_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THIndex_t nOutputPlane); +TH_API void THNN_(PReLU_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradWeight, + THTensor *gradWeightBuf, + THTensor *gradWeightBuf2, + THIndex_t nOutputPlane, + accreal scale); + +TH_API void THNN_(Linear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *addBuffer); +TH_API void THNN_(Linear_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight); +TH_API void THNN_(Linear_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *bias, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *addBuffer, + accreal scale); + +TH_API void THNN_(RReLU_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *noise, + accreal lower, + accreal upper, + bool train, + bool inplace, + THGenerator *generator); +TH_API void THNN_(RReLU_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *noise, + accreal lower, + accreal upper, + bool train, + bool inplace); + +TH_API void THNN_(Sigmoid_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output); +TH_API void THNN_(Sigmoid_updateGradInput)( + THNNState *state, + THTensor *input, // [OPTIONAL] + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output); + +TH_API void THNN_(SmoothL1Criterion_updateOutput)( + 
THNNState *state, + THTensor *input, + THTensor *target, + THTensor *output, + bool sizeAverage); +TH_API void THNN_(SmoothL1Criterion_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *target, + THTensor *gradInput, + bool sizeAverage); + +TH_API void THNN_(SoftMax_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output); +TH_API void THNN_(SoftMax_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output); + +TH_API void THNN_(SoftPlus_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal beta, + accreal threshold); +TH_API void THNN_(SoftPlus_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output, + accreal beta, + accreal threshold); + +TH_API void THNN_(SoftShrink_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal lambda); +TH_API void THNN_(SoftShrink_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + accreal lambda); + + +TH_API void THNN_(IndexLinear_updateOutput)( + THNNState *state, + THIndexTensor *keys, + long keysOffset, + THTensor *values, + THIndexTensor *sizes, + THIndexTensor *cumSumSizes, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *normalizedValues, + int train); +TH_API void THNN_(IndexLinear_accGradParameters)( + THNNState *state, + THIndexTensor *keys, + long keysOffset, + THTensor *values, + THIndexTensor *sizes, + THIndexTensor *cumSumSizes, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + THTensor* valuesBuffer, + accreal weightDecay, + accreal scale); +TH_API void THNN_(IndexLinear_accUpdateGradParameters)( + THNNState *state, + THIndexTensor *keys, + long keysOffset, + THTensor *values, + THIndexTensor *sizes, + THIndexTensor *cumSumSizes, + THTensor *gradOutput, + THTensor *weight, + THTensor *bias, + accreal weightDecay, + accreal scale); +TH_API void THNN_(IndexLinear_updateParameters)( + THNNState *state, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + THIndexTensor *runningKeys, + THIndexTensor *cumSumSizes, + long keysOffset, + accreal weightDecay, + accreal learningRate); + +TH_API void THNN_(SparseLinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias); +TH_API void THNN_(SparseLinear_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + accreal weightDecay, + accreal scale); +TH_API void THNN_(SparseLinear_zeroGradParameters)( + THNNState *state, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput); +TH_API void THNN_(SparseLinear_updateParameters)( + THNNState *state, + THTensor *weight, + THTensor *bias, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput, + accreal learningRate); +TH_API void THNN_(SparseLinear_legacyUpdateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias); +TH_API void THNN_(SparseLinear_legacyAccGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *weight, + THTensor *bias, + accreal weightDecay, + accreal scale); +TH_API void THNN_(SparseLinear_legacyZeroGradParameters)( + THNNState 
*state, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput); +TH_API void THNN_(SparseLinear_legacyUpdateParameters)( + THNNState *state, + THTensor *weight, + THTensor *bias, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *lastInput, + accreal learningRate); + +TH_API void THNN_(Sqrt_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal eps); +TH_API void THNN_(Sqrt_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output); + +TH_API void THNN_(Square_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output); +TH_API void THNN_(Square_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput); + +TH_API void THNN_(Tanh_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output); +TH_API void THNN_(Tanh_updateGradInput)( + THNNState *state, + THTensor *input, // [OPTIONAL] + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output); + +TH_API void THNN_(Threshold_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal threshold, + accreal val, + bool inplace); +TH_API void THNN_(Threshold_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + accreal threshold, + accreal val, + bool inplace); + +TH_API void THNN_(TemporalConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, int dW, + int inputFrameSize, + int outputFrameSize); +TH_API void THNN_(TemporalConvolution_updateGradInput)( + THNNState* state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, int dW); +TH_API void THNN_(TemporalConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, int dW, + accreal scale); +TH_API void THNN_(TemporalMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kW, int dW); +TH_API void THNN_(TemporalMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kW, int dW); +TH_API void THNN_(TemporalSubSampling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, int dW, + int inputFrameSize); +TH_API void THNN_(TemporalSubSampling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, int dW); +TH_API void THNN_(TemporalSubSampling_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, int dW, + accreal scale); + +TH_API void THNN_(TemporalRowConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, + int kW, + int dW, + int padW, + bool featFirst); +TH_API void THNN_(TemporalRowConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kW, + int dW, + int padW, + bool featFirst); +TH_API void THNN_(TemporalRowConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor 
*gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kW, + int dW, + int padW, + bool featFirst, + accreal scale); + +TH_API void THNN_(BatchNormalization_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, // [OPTIONAL] + THTensor *bias, // [OPTIONAL] + THTensor *running_mean, + THTensor *running_var, + THTensor *save_mean, + THTensor *save_std, + bool train, + double momentum, + double eps); +TH_API void THNN_(BatchNormalization_backward)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, // [OPTIONAL] + THTensor *gradWeight, // [OPTIONAL] + THTensor *gradBias, // [OPTIONAL] + THTensor *weight, // [OPTIONAL] + THTensor *running_mean, + THTensor *running_var, + THTensor *save_mean, + THTensor *save_std, + bool train, + double scale, + double eps); + +TH_API void THNN_(SpatialConvolutionMap_updateOutput)( + THNNState *state, // library state + THTensor *input, // input tensor + THTensor *output, // [OUT] convolution output + THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW) + THTensor *bias, // 1D bias tensor (nOutputPlane) + THTensor *connTable, // connection table + int nInputPlane, // number of input planes + int nOutputPlane, // number of output planes + int dW, int dH); // stride +TH_API void THNN_(SpatialConvolutionMap_updateGradInput)( + THNNState *state, // library state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. output + THTensor *gradInput, // [OUT] gradient w.r.t. input + THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW) + THTensor *bias, // 1D bias tensor (nOutputPlane) + THTensor *connTable, // connection table + int nInputPlane, // number of input planes + int nOutputPlane, // number of output planes + int dW, int dH); // stride +TH_API void THNN_(SpatialConvolutionMap_accGradParameters)( + THNNState *state, // library state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. 
output + THTensor *gradWeight, // 3D gradWeight tensor (connTable:size(1) x kH x kW) + THTensor *gradBias, // 1D gradBias tensor (nOutputPlane) + THTensor *connTable, // connection table + int nInputPlane, // number of input planes + int nOutputPlane, // number of output planes + int dW, int dH, // stride + accreal scale); // scaling factor + +TH_API void THNN_(SpatialConvolutionMM_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, // [OPTIONAL] + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH); +TH_API void THNN_(SpatialConvolutionMM_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH); +TH_API void THNN_(SpatialConvolutionMM_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, // [OPTIONAL] + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + accreal scale); + +TH_API void THNN_(SpatialDepthWiseConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, // [OPTIONAL] + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH); +TH_API void THNN_(SpatialDepthWiseConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH); +TH_API void THNN_(SpatialDepthWiseConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, // [OPTIONAL] + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + accreal scale); + +TH_API void THNN_(SpatialConvolutionLocal_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + long inputWidth, long inputHeight, + long outputWidth, long outputHeight); +TH_API void THNN_(SpatialConvolutionLocal_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + long inputWidth, long inputHeight, + long outputWidth, long outputHeight); +TH_API void THNN_(SpatialConvolutionLocal_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + long inputWidth, long inputHeight, + long outputWidth, long outputHeight, + accreal scale); + +TH_API void THNN_(SpatialAdaptiveMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int owidth, int oheight); +TH_API void THNN_(SpatialAdaptiveMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices); + +TH_API void THNN_(SpatialAdaptiveAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + 
THTensor *output, + int owidth, int oheight); +TH_API void THNN_(SpatialAdaptiveAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput); + +TH_API void THNN_(SpatialAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode, + bool count_include_pad); +TH_API void THNN_(SpatialAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode, + bool count_include_pad); + +TH_API void THNN_(SpatialFractionalMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THIndexTensor *indices, + THTensor *randomSamples); +TH_API void THNN_(SpatialFractionalMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int outputW, int outputH, + int poolSizeW, int poolSizeH, + THIndexTensor *indices); + +TH_API void THNN_(SpatialFullConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, // [OPTIONAL] + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH); +TH_API void THNN_(SpatialFullConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradColumns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH); +TH_API void THNN_(SpatialFullConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, // [OPTIONAL] + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int adjW, int adjH, + accreal scale); + +TH_API void THNN_(SpatialFullConvolutionMap_updateOutput)( + THNNState *state, // library state + THTensor *input, // input tensor + THTensor *output, // [OUT] convolution output + THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW) + THTensor *bias, // 1D bias tensor (nOutputPlane) + THTensor *connTable, // connection table + int nInputPlane, // number of input planes + int nOutputPlane, // number of output planes + int dW, int dH); // stride +TH_API void THNN_(SpatialFullConvolutionMap_updateGradInput)( + THNNState *state, // library state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. output + THTensor *gradInput, // [OUT] gradient w.r.t. input + THTensor *weight, // 3D weight tensor (connTable:size(1) x kH x kW) + THTensor *bias, // 1D bias tensor (nOutputPlane) + THTensor *connTable, // connection table + int nInputPlane, // number of input planes + int nOutputPlane, // number of output planes + int dW, int dH); // stride +TH_API void THNN_(SpatialFullConvolutionMap_accGradParameters)( + THNNState *state, // library state + THTensor *input, // input tensor + THTensor *gradOutput, // gradient w.r.t. 
output + THTensor *gradWeight, // 3D gradWeight tensor (connTable:size(1) x kH x kW) + THTensor *gradBias, // 1D gradBias tensor (nOutputPlane) + THTensor *connTable, // connection table + int nInputPlane, // number of input planes + int nOutputPlane, // number of output planes + int dW, int dH, // stride + accreal scale); // scaling factor + +TH_API void THNN_(SpatialDilatedConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, // [OPTIONAL] + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH); + +TH_API void THNN_(SpatialDilatedConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradColumns, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH); + +TH_API void THNN_(SpatialDilatedConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, // [OPTIONAL] + THTensor *columns, + THTensor *ones, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + accreal scale); + +TH_API void THNN_(SpatialMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode); +TH_API void THNN_(SpatialMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + bool ceil_mode); + +TH_API void THNN_(SpatialDilatedMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + bool ceil_mode); +TH_API void THNN_(SpatialDilatedMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int dilationW, int dilationH, + bool ceil_mode); + +TH_API void THNN_(SpatialMaxUnpooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int owidth, int oheight); +TH_API void THNN_(SpatialMaxUnpooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int owidth, int oheight); + +TH_API void THNN_(SpatialSubSampling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, int kH, + int dW, int dH); +TH_API void THNN_(SpatialSubSampling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, int kH, + int dW, int dH); +TH_API void THNN_(SpatialSubSampling_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, int kH, + int dW, int dH, + accreal scale); + +TH_API void THNN_(SpatialUpSamplingNearest_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int scale_factor); +TH_API void THNN_(SpatialUpSamplingNearest_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int scale_factor); + 
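
Every prototype in this header goes through the THNN_() name macro, so each declaration is instantiated once per real type. Under the usual TH conventions (an assumption here, not something this excerpt spells out), THNN_(SpatialMaxPooling_updateOutput) expands to THNN_FloatSpatialMaxPooling_updateOutput in the float build, THIndexTensor is the long tensor type, and the CPU kernels ignore the THNNState argument. A minimal caller sketch on those assumptions, with the hypothetical names pool_demo and input:

#include "THNN.h"

/* Sketch: 2x2 max pooling with stride 2 on a 3D (C x H x W) float tensor.
   The function resizes output and indices itself; the caller owns and
   frees every tensor it created. */
void pool_demo(THFloatTensor *input)
{
  THFloatTensor *output  = THFloatTensor_new();
  THLongTensor  *indices = THLongTensor_new(); /* THIndexTensor assumed == THLongTensor */

  THNN_FloatSpatialMaxPooling_updateOutput(
      NULL,             /* THNNState*; assumed unused by the CPU path */
      input, output, indices,
      2, 2,             /* kW, kH */
      2, 2,             /* dW, dH */
      0, 0,             /* padW, padH */
      0);               /* ceil_mode = false */

  /* ... consume output here ... */

  THLongTensor_free(indices);
  THFloatTensor_free(output);
}

The same calling pattern should apply to the other updateOutput/updateGradInput pairs declared above.
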
+TH_API void THNN_(SpatialUpSamplingBilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputHeight, + int outputWidth); +TH_API void THNN_(SpatialUpSamplingBilinear_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int nbatch, + int nchannels, + int inputHeight, + int inputWidth, + int outputHeight, + int outputWidth); + +TH_API void THNN_(unfolded_acc)( + THTensor *finput, + THTensor *input, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int nInputPlane, + int inputWidth, int inputHeight, + int outputWidth, int outputHeight); +TH_API void THNN_(unfolded_copy)( + THTensor *finput, + THTensor *input, + int kW, int kH, + int dW, int dH, + int padW, int padH, + int nInputPlane, + int inputWidth, int inputHeight, + int outputWidth, int outputHeight); + +TH_API void THNN_(VolumetricAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int kT, int kW, int kH, + int dT, int dW, int dH); +TH_API void THNN_(VolumetricAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int kT, int kW, int kH, + int dT, int dW, int dH); + +TH_API void THNN_(VolumetricConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, // [OPTIONAL] + THTensor *finput, + THTensor *fgradInput, + int dT, int dW, int dH, + int pT, int pW, int pH); +TH_API void THNN_(VolumetricConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + int dT, int dW, int dH, + int pT, int pW, int pH); +TH_API void THNN_(VolumetricConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, // [OPTIONAL] + THTensor *finput, + THTensor *fgradInput, + int dT, int dW, int dH, + int pT, int pW, int pH, + accreal scale); + +TH_API void THNN_(VolumetricConvolutionMM_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, // [OPTIONAL] + THTensor *finput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH); +TH_API void THNN_(VolumetricConvolutionMM_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH); +TH_API void THNN_(VolumetricConvolutionMM_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, // [OPTIONAL] + THTensor *finput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH, + accreal scale); + +TH_API void THNN_(VolumetricFractionalMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputT, int outputW, int outputH, + int poolSizeT, int poolSizeW, int poolSizeH, + THIndexTensor *indices, + THTensor *randomSamples); +TH_API void THNN_(VolumetricFractionalMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int outputT, int outputW, int outputH, + int poolSizeT, int poolSizeW, int poolSizeH, + THIndexTensor *indices); + +TH_API void THNN_(VolumetricFullConvolution_updateOutput)( + THNNState *state, // library state + THTensor *input, // 4D or 5D 
(batch) tensor
+          THTensor *output,        // [OUT] volumetric convolution output
+          THTensor *weight,        // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
+          THTensor *bias,          // [OPTIONAL] bias tensor (nOutputPlane)
+          THTensor *finput,        // [OUT] internal columns buffer
+          THTensor *fgradInput,    // [OUT] internal ones buffer
+          int dT, int dW, int dH,  // stride of the convolution
+          int pT, int pW, int pH,  // padding
+          int aT, int aW, int aH); // extra output adjustment
+TH_API void THNN_(VolumetricFullConvolution_updateGradInput)(
+          THNNState *state,        // library state
+          THTensor *input,         // 4D or 5D (batch) tensor
+          THTensor *gradOutput,    // gradient w.r.t. output
+          THTensor *gradInput,     // [OUT] gradient w.r.t. input
+          THTensor *weight,        // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
+          THTensor *finput,        // internal columns buffer
+          THTensor *fgradInput,    // internal ones buffer
+          int dT, int dW, int dH,  // stride
+          int pT, int pW, int pH,  // padding
+          int aT, int aW, int aH); // extra output adjustment
+TH_API void THNN_(VolumetricFullConvolution_accGradParameters)(
+          THNNState *state,        // library state
+          THTensor *input,         // 4D or 5D (batch) tensor
+          THTensor *gradOutput,    // gradient w.r.t. output
+          THTensor *gradWeight,    // gradWeight tensor (nInputPlane x nOutputPlane x kT x kH x kW)
+          THTensor *gradBias,      // [OPTIONAL] gradBias tensor (nOutputPlane)
+          THTensor *finput,        // internal columns buffer
+          THTensor *fgradInput,    // internal ones buffer
+          int dT, int dW, int dH,  // stride
+          int pT, int pW, int pH,  // padding
+          int aT, int aW, int aH,  // extra output adjustment
+          accreal scale);          // scaling factor
+
+TH_API void THNN_(VolumetricDilatedConvolution_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THTensor *weight,
+          THTensor *bias,          // [OPTIONAL]
+          THTensor *columns,
+          THTensor *ones,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH,
+          int padT, int padW, int padH,
+          int dilationT, int dilationW, int dilationH);
+
+TH_API void THNN_(VolumetricDilatedConvolution_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THTensor *weight,
+          THTensor *gradColumns,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH,
+          int padT, int padW, int padH,
+          int dilationT, int dilationW, int dilationH);
+
+TH_API void THNN_(VolumetricDilatedConvolution_accGradParameters)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradWeight,
+          THTensor *gradBias,      // [OPTIONAL]
+          THTensor *columns,
+          THTensor *ones,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH,
+          int padT, int padW, int padH,
+          int dilationT, int dilationW, int dilationH,
+          accreal scale);
+
+TH_API void THNN_(VolumetricMaxPooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THIndexTensor *indices,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH,
+          bool ceilMode);
+TH_API void THNN_(VolumetricMaxPooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THIndexTensor *indices,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH,
+          bool ceilMode);
+
+TH_API void THNN_(VolumetricDilatedMaxPooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THIndexTensor *indices,
+          int kT, int kW, int kH,
+          int dT, int dW, int dH,
+          int pT, int pW, int pH,
+          int dilationT, int dilationW, int dilationH,
+          bool ceilMode);
+TH_API void THNN_(VolumetricDilatedMaxPooling_updateGradInput)(
+ THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH, + int dilationT, int dilationW, int dilationH, + bool ceilMode); + +TH_API void THNN_(VolumetricMaxUnpooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int oT, int oW, int oH, + int dT, int dW, int dH, + int pT, int pW, int pH); +TH_API void THNN_(VolumetricMaxUnpooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int oT, int oW, int oH, + int dT, int dW, int dH, + int pT, int pW, int pH); + +TH_API void THNN_(SpatialReflectionPadding_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int pad_l, int pad_r, + int pad_t, int pad_b); + +TH_API void THNN_(SpatialReflectionPadding_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_l, int pad_r, + int pad_t, int pad_b); + +TH_API void THNN_(SpatialReplicationPadding_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int pad_l, int pad_r, + int pad_t, int pad_b); + +TH_API void THNN_(SpatialReplicationPadding_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pad_l, int pad_r, + int pad_t, int pad_b); + +TH_API void THNN_(VolumetricReplicationPadding_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback); + +TH_API void THNN_(VolumetricReplicationPadding_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback); + +TH_API void THNN_(VolumetricUpSamplingNearest_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int scale_factor); +TH_API void THNN_(VolumetricUpSamplingNearest_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int scale_factor); + +TH_API void THNN_(VolumetricUpSamplingTrilinear_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputDepth, + int outputHeight, + int outputWidth); +TH_API void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)( + THNNState *state, + THTensor *gradOutput, + THTensor *gradInput, + int nbatch, + int nchannels, + int inputDepth, + int inputHeight, + int inputWidth, + int outputDepth, + int outputHeight, + int outputWidth); + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Tanh.c b/contrib/lua-torch/nn/lib/THNN/generic/Tanh.c new file mode 100644 index 000000000..ecf0708c2 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/Tanh.c @@ -0,0 +1,49 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Tanh.c" +#else + +void THNN_(Tanh_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output) +{ + THTensor_(tanh)(output, input); +} + +void THNN_(Tanh_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *output) +{ + THNN_CHECK_SHAPE(output, gradOutput); + THTensor_(resizeAs)(gradInput, output); + + if (output->nDimension == 1 || + !THTensor_(isContiguous)(output) || + !THTensor_(isContiguous)(gradOutput) || + !THTensor_(isContiguous)(gradInput)) + { + TH_TENSOR_APPLY3(real, gradInput, 
real, gradOutput, real, output, + real z = *output_data; \ + *gradInput_data = *gradOutput_data * (1. - z*z); + ); + } + else + { + real* ptr_gradOutput = THTensor_(data)(gradOutput); + real* ptr_gradInput = THTensor_(data)(gradInput); + real* ptr_output = THTensor_(data)(output); + long i; + +#pragma omp parallel for private(i) + for (i = 0; i < THTensor_(nElement)(gradInput); i++) + { + real z = ptr_output[i]; + ptr_gradInput[i] = ptr_gradOutput[i] * (1. - z*z); + } + } +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/TemporalConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/TemporalConvolution.c new file mode 100644 index 000000000..8cfd97d85 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/TemporalConvolution.c @@ -0,0 +1,398 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/TemporalConvolution.c" +#else + +static inline void THNN_(TemporalConvolution_shapeCheck)( + THNNState *state, + THTensor *input, + int kW, + int dW, + int *inputFrameSize) { + + THArgCheck(kW > 0, 9, + "kernel size should be greater than zero, but got kW: %d", kW); + THArgCheck(dW > 0, 11, + "stride should be greater than zero, but got dW: %d", dW); + + int dimS = 0; // sequence dimension + int dimF = 1; // feature dimension + + if (input->nDimension == 3) + { + dimS = 1; + dimF = 2; + } + THNN_ARGCHECK(input->nDimension == 2 || input->nDimension == 3, 2, input, + "2D or 3D (batch mode) tensor expected for input, but got: %s"); + if (inputFrameSize != NULL) { + THArgCheck(input->size[dimF] == *inputFrameSize, 2, + "invalid input frame size. Got: %d, Expected: %d", + input->size[dimF], *inputFrameSize); + } + THArgCheck(input->size[dimS] >= kW, 2, + "input sequence smaller than kernel size. Got: %d, Expected: %d", + input->size[dimS], kW); +} + +void THNN_(TemporalConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, + int dW, + int inputFrameSize, + int outputFrameSize) +{ + THTensor *outputWindow, *inputWindow; + int nInputFrame, nOutputFrame; + long k, i; + + int dimS = 0; // sequence dimension + int dimF = 1; // feature dimension + + if (input->nDimension == 3) + { + dimS = 1; + dimF = 2; + } + + THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous"); + THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous"); + THNN_(TemporalConvolution_shapeCheck) + (state, input, kW, dW, &inputFrameSize); + input = THTensor_(newContiguous)(input); + outputWindow = THTensor_(new)(); + inputWindow = THTensor_(new)(); + + nInputFrame = input->size[dimS]; + nOutputFrame = (nInputFrame - kW) / dW + 1; + + if (input->nDimension == 2) + { + THTensor_(resize2d)(output, + nOutputFrame, + outputFrameSize); + + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THTensor_(select)(outputWindow, output, 0, k); + THTensor_(copy)(outputWindow, bias); + } + + /* ouch */ + for(k = 0; nOutputFrame > 0; k++) + { + long outputFrameStride = (kW-1)/dW+1; + long inputFrameStride = outputFrameStride*dW; + long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputFrame -= nFrame; + + THTensor_(setStorage2d)(inputWindow, input->storage, + input->storageOffset+k*dW*input->size[1], + nFrame, inputFrameStride*input->size[1], + kW*input->size[1], 1); + + THTensor_(setStorage2d)(outputWindow, output->storage, + output->storageOffset + k*output->size[1], + nFrame, outputFrameStride*output->size[1], + output->size[1], 1); + + THTensor *tweight = THTensor_(new)(); + 
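+      /* tweight <- weight^T below; the addmm then computes
+         outputWindow += inputWindow * weight^T, covering a whole block of
+         non-overlapping output frames with a single matrix multiply. */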
THTensor_(transpose)(tweight, weight, 0, 1); + THTensor_(addmm)(outputWindow, 1, outputWindow, 1, inputWindow, tweight); + THTensor_(free)(tweight); + } + } + else + { + THTensor *outputSample = THTensor_(new)(); + THTensor *inputSample = THTensor_(new)(); + int nBatchFrame = input->size[0]; + + THTensor_(resize3d)(output, + nBatchFrame, + nOutputFrame, + outputFrameSize); + + for(i = 0; i < nBatchFrame; i++) + { + THTensor_(select)(outputSample, output, 0, i); + THTensor_(select)(inputSample, input, 0, i); + long nOutputSampleFrame = nOutputFrame; + + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THTensor_(select)(outputWindow, outputSample, 0, k); + THTensor_(copy)(outputWindow, bias); + } + + /* ouch */ + for(k = 0; nOutputSampleFrame > 0; k++) + { + long outputFrameStride = (kW-1)/dW+1; + long inputFrameStride = outputFrameStride*dW; + long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputSampleFrame -= nFrame; + + THTensor_(setStorage2d)(inputWindow, inputSample->storage, + inputSample->storageOffset+k*dW*inputSample->size[1], + nFrame, inputFrameStride*inputSample->size[1], + kW*inputSample->size[1], 1); + + THTensor_(setStorage2d)(outputWindow, outputSample->storage, + outputSample->storageOffset + k*outputSample->size[1], + nFrame, outputFrameStride*outputSample->size[1], + outputSample->size[1], 1); + + THTensor *tweight = THTensor_(new)(); + THTensor_(transpose)(tweight, weight, 0, 1); + THTensor_(addmm)(outputWindow, 1, outputWindow, 1, inputWindow, tweight); + THTensor_(free)(tweight); + } + } + THTensor_(free)(outputSample); + THTensor_(free)(inputSample); + } + + THTensor_(free)(outputWindow); + THTensor_(free)(inputWindow); + THTensor_(free)(input); + +} + +void THNN_(TemporalConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, + int dW) +{ + long nInputFrame; + long nOutputFrame; + + THTensor *gradOutputWindow; + THTensor *gradInputWindow; + long k, i; + + int dimS = 0; // sequence dimension + int dimF = 1; // feature dimension + + if (gradOutput->nDimension == 3) + { + dimS = 1; + dimF = 2; + } + + THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous"); + THNN_(TemporalConvolution_shapeCheck)( + state, input, kW, dW, NULL); + nInputFrame = input->size[dimS]; + nOutputFrame = gradOutput->size[dimS]; + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + + gradOutputWindow = THTensor_(new)(); + gradInputWindow = THTensor_(new)(); + + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (gradOutput->nDimension == 2) + { + /* ouch */ + for(k = 0; nOutputFrame > 0; k++) + { + long outputFrameStride = (kW-1)/dW+1; + long inputFrameStride = outputFrameStride*dW; + long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputFrame -= nFrame; + + THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage, + gradOutput->storageOffset + k*gradOutput->size[1], + nFrame, outputFrameStride*gradOutput->size[1], + gradOutput->size[1], 1); + + THTensor_(setStorage2d)(gradInputWindow, gradInput->storage, + gradInput->storageOffset+k*dW*gradInput->size[1], + nFrame, inputFrameStride*gradInput->size[1], + kW*gradInput->size[1], 1); + + THTensor_(addmm)(gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight); + } + } + else + { + THTensor *gradOutputSample = THTensor_(new)(); + THTensor *gradInputSample = THTensor_(new)(); + int nBatchFrame = input->size[0]; + + for(i = 
0; i < nBatchFrame; i++) + { + THTensor_(select)(gradOutputSample, gradOutput, 0, i); + THTensor_(select)(gradInputSample, gradInput, 0, i); + int nOutputSampleFrame = nOutputFrame; + + /* ouch */ + for(k = 0; nOutputSampleFrame > 0; k++) + { + long outputFrameStride = (kW-1)/dW+1; + long inputFrameStride = outputFrameStride*dW; + long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputSampleFrame -= nFrame; + + THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage, + gradOutputSample->storageOffset + k*gradOutputSample->size[1], + nFrame, outputFrameStride*gradOutputSample->size[1], + gradOutputSample->size[1], 1); + + THTensor_(setStorage2d)(gradInputWindow, gradInputSample->storage, + gradInputSample->storageOffset+k*dW*gradInputSample->size[1], + nFrame, inputFrameStride*gradInputSample->size[1], + kW*gradInputSample->size[1], 1); + + THTensor_(addmm)(gradInputWindow, 1, gradInputWindow, 1, gradOutputWindow, weight); + } + } + THTensor_(free)(gradOutputSample); + THTensor_(free)(gradInputSample); + } + + THTensor_(free)(gradOutputWindow); + THTensor_(free)(gradInputWindow); + THTensor_(free)(gradOutput); + THTensor_(free)(input); + +} + +void THNN_(TemporalConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, + int dW, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + long nInputFrame; + long nOutputFrame; + + THTensor *gradOutputWindow; + THTensor *inputWindow; + long k, i; + + int dimS = 0; // sequence dimension + int dimF = 1; // feature dimension + + if (gradOutput->nDimension == 3) + { + dimS = 1; + dimF = 2; + } + + THNN_(TemporalConvolution_shapeCheck)( + state, input, kW, dW, NULL); + nInputFrame = input->size[dimS]; + nOutputFrame = gradOutput->size[dimS]; + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + gradOutputWindow = THTensor_(new)(); + inputWindow = THTensor_(new)(); + + if (input->nDimension == 2) + { + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THTensor_(select)(gradOutputWindow, gradOutput, 0, k); + THTensor_(cadd)(gradBias, gradBias, scale, gradOutputWindow); + } + + /* ouch */ + for(k = 0; nOutputFrame > 0; k++) + { + long outputFrameStride = (kW-1)/dW+1; + long inputFrameStride = outputFrameStride*dW; + long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputFrame -= nFrame; + + THTensor_(setStorage2d)(inputWindow, input->storage, + input->storageOffset+k*dW*input->size[1], + nFrame, inputFrameStride*input->size[1], + kW*input->size[1], 1); + + THTensor_(setStorage2d)(gradOutputWindow, gradOutput->storage, + gradOutput->storageOffset + k*gradOutput->size[1], + nFrame, outputFrameStride*gradOutput->size[1], + gradOutput->size[1], 1); + + THTensor *tgradOutputWindow = THTensor_(new)(); + THTensor_(transpose)(tgradOutputWindow, gradOutputWindow, 0, 1); + THTensor_(addmm)(gradWeight, 1, gradWeight, scale, tgradOutputWindow, inputWindow); + THTensor_(free)(tgradOutputWindow); + } + } + else + { + THTensor *gradOutputSample = THTensor_(new)(); + THTensor *inputSample = THTensor_(new)(); + int nBatchFrame = input->size[0]; + + for(i = 0; i < nBatchFrame; i++) + { + THTensor_(select)(gradOutputSample, gradOutput, 0, i); + THTensor_(select)(inputSample, input, 0, i); + int nOutputSampleFrame = nOutputFrame; + + /* bias first */ + for(k = 0; k < nOutputFrame; k++) + { + THTensor_(select)(gradOutputWindow, gradOutputSample, 0, k); + 
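+        /* gradBias += scale * (k-th output frame's gradient) */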
THTensor_(cadd)(gradBias, gradBias, scale, gradOutputWindow); + } + + /* ouch */ + for(k = 0; nOutputSampleFrame > 0; k++) + { + long outputFrameStride = (kW-1)/dW+1; + long inputFrameStride = outputFrameStride*dW; + long nFrame = (nInputFrame-k*dW-kW)/inputFrameStride + 1; + nOutputSampleFrame -= nFrame; + + THTensor_(setStorage2d)(inputWindow, inputSample->storage, + inputSample->storageOffset+k*dW*inputSample->size[1], + nFrame, inputFrameStride*inputSample->size[1], + kW*inputSample->size[1], 1); + + THTensor_(setStorage2d)(gradOutputWindow, gradOutputSample->storage, + gradOutputSample->storageOffset + k*gradOutputSample->size[1], + nFrame, outputFrameStride*gradOutputSample->size[1], + gradOutputSample->size[1], 1); + + THTensor *tgradOutputWindow = THTensor_(new)(); + THTensor_(transpose)(tgradOutputWindow, gradOutputWindow, 0, 1); + THTensor_(addmm)(gradWeight, 1, gradWeight, scale, tgradOutputWindow, inputWindow); + THTensor_(free)(tgradOutputWindow); + } + } + THTensor_(free)(gradOutputSample); + THTensor_(free)(inputSample); + } + + THTensor_(free)(gradOutputWindow); + THTensor_(free)(inputWindow); + THTensor_(free)(gradOutput); + THTensor_(free)(input); + +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/TemporalMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/TemporalMaxPooling.c new file mode 100644 index 000000000..344c1b3fd --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/TemporalMaxPooling.c @@ -0,0 +1,283 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/TemporalMaxPooling.c" +#else + +static inline void THNN_(TemporalMaxPooling_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THIndexTensor *indices, + int kW, + int dW) { + long niframe; + long framesize; + long noframe; + + int dimS = 0; // sequence dimension + int dimF = 1; // feature dimension + int ndims = input->nDimension; + + if (input->nDimension == 3) + { + dimS = 1; + dimF = 2; + } + + niframe = input->size[dimS]; + framesize = input->size[dimF]; + noframe = (niframe - kW) / dW + 1; + + THArgCheck(kW > 0, 5, + "kernel size should be greater than zero, but got kW: %d", kW); + THArgCheck(dW > 0, 6, + "stride should be greater than zero, but got dW: %d", dW); + + THNN_ARGCHECK(input->nDimension == 2 || input->nDimension == 3, 2, input, + "2D or 3D (batch mode) tensor expected for input, but got: %s"); + THArgCheck(input->size[dimS] >= kW, 2, + "input sequence smaller than kernel size. 
Got: %d, Expected: %d",
+             input->size[dimS], kW);
+
+  if (gradOutput != NULL) {
+    THNN_CHECK_DIM_SIZE(gradOutput, ndims, dimS, noframe);
+    THNN_CHECK_DIM_SIZE(gradOutput, ndims, dimF, framesize);
+  }
+  if (indices != NULL) {
+    THNN_CHECK_DIM_SIZE_INDICES(indices, ndims, dimS, noframe);
+    THNN_CHECK_DIM_SIZE_INDICES(indices, ndims, dimF, framesize);
+  }
+}
+
+void THNN_(TemporalMaxPooling_updateOutput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *output,
+          THIndexTensor *indices,
+          int kW,
+          int dW)
+{
+  long niframe;
+  long framesize;
+  long noframe;
+
+  real *input_data;
+  real *output_data;
+  THIndex_t *indices_data;
+
+  long t, y;
+
+  int dimS = 0; // sequence dimension
+  int dimF = 1; // feature dimension
+
+  THNN_(TemporalMaxPooling_shapeCheck)(state, input, NULL, NULL, kW, dW);
+
+  if (input->nDimension == 3)
+  {
+    dimS = 1;
+    dimF = 2;
+  }
+
+  /* sizes */
+  niframe = input->size[dimS];
+  framesize = input->size[dimF];
+  noframe = (niframe - kW) / dW + 1;
+
+  /* get contiguous input */
+  input = THTensor_(newContiguous)(input);
+
+  if (input->nDimension == 2)
+  {
+    /* resize output */
+    THTensor_(resize2d)(output, noframe, framesize);
+
+    /* indices will contain index locations for each output point */
+    THIndexTensor_(resize2d)(indices, noframe, framesize);
+
+    /* get raw pointers */
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+    indices_data = THIndexTensor_(data)(indices);
+
+    for(t = 0; t < noframe; t++)
+    {
+      real *ip = input_data + t*framesize*dW;
+      real *op = output_data + t*framesize;
+      THIndex_t *xp = indices_data + t*framesize;
+#pragma omp parallel for private(y)
+      for(y = 0; y < framesize; y++)
+      {
+        /* compute local max: */
+        long maxindex = -1;
+        real maxval = -THInf;
+        long x;
+        for(x = 0; x < kW; x++)
+        {
+          real val = ip[x*framesize+y];
+          if (val > maxval)
+          {
+            maxval = val;
+            maxindex = x;
+          }
+        }
+
+        /* set output to local max */
+        op[y] = maxval;
+        xp[y] = (THIndex_t)maxindex;
+      }
+    }
+  }
+  else
+  {
+    /* number of batch frames */
+    long nbframe = input->size[0];
+    long i;
+
+    /* resize output */
+    THTensor_(resize3d)(output, nbframe, noframe, framesize);
+
+    /* indices will contain index locations for each output point */
+    THIndexTensor_(resize3d)(indices, nbframe, noframe, framesize);
+
+    /* get raw pointers */
+    input_data = THTensor_(data)(input);
+    output_data = THTensor_(data)(output);
+    indices_data = THIndexTensor_(data)(indices);
+
+    for(i = 0; i < nbframe; i++)
+    {
+      real *inputSample_data = input_data + i*niframe*framesize;
+      real *outputSample_data = output_data + i*noframe*framesize;
+      THIndex_t *indicesSample_data = indices_data + i*noframe*framesize;
+
+      for(t = 0; t < noframe; t++)
+      {
+        real *ip = inputSample_data + t*framesize*dW;
+        real *op = outputSample_data + t*framesize;
+        THIndex_t *xp = indicesSample_data + t*framesize;
+
+#pragma omp parallel for private(y)
+        for(y = 0; y < framesize; y++)
+        {
+          /* compute local max: */
+          long maxindex = -1;
+          real maxval = -THInf;
+          long x;
+          for(x = 0; x < kW; x++)
+          {
+            real val = ip[x*framesize+y];
+            if (val > maxval)
+            {
+              maxval = val;
+              maxindex = x;
+            }
+          }
+
+          /* set output to local max */
+          op[y] = maxval;
+          xp[y] = (THIndex_t)maxindex;
+        }
+      }
+    }
+  }
+
+  /* cleanup */
+  THTensor_(free)(input);
+
+}
+
+void THNN_(TemporalMaxPooling_updateGradInput)(
+          THNNState *state,
+          THTensor *input,
+          THTensor *gradOutput,
+          THTensor *gradInput,
+          THIndexTensor *indices,
+          int kW,
+          int dW)
+{
+  long niframe;
+  int noframe;
+  long framesize;
+
+  real *gradInput_data;
+  real
*gradOutput_data;
+  THIndex_t *indices_data;
+
+  long t, y;
+
+  THNN_(TemporalMaxPooling_shapeCheck)(state, input, gradOutput, indices, kW, dW);
+  /* get contiguous gradOutput */
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+
+  /* resize and zero */
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(zero)(gradInput);
+
+  int dimS = 0; // sequence dimension
+  int dimF = 1; // feature dimension
+
+  if (input->nDimension == 3)
+  {
+    dimS = 1;
+    dimF = 2;
+  }
+  /* sizes */
+  niframe = input->size[dimS];
+  noframe = gradOutput->size[dimS];
+  framesize = gradOutput->size[dimF];
+
+  /* get raw pointers */
+  gradInput_data = THTensor_(data)(gradInput);
+  gradOutput_data = THTensor_(data)(gradOutput);
+  indices_data = THIndexTensor_(data)(indices);
+
+  if (input->nDimension == 2)
+  {
+    for(t = 0; t < noframe; t++)
+    {
+      real *gip = gradInput_data + t*framesize*dW;
+      real *gop = gradOutput_data + t*framesize;
+      THIndex_t *xp = indices_data + t*framesize;
+#pragma omp parallel for private(y)
+      for(y = 0; y < framesize; y++)
+      {
+        /* compute local max: */
+        long maxindex = (long)xp[y];
+        if (maxindex != -1)
+          gip[maxindex*framesize+y] += gop[y];
+      }
+    }
+  }
+  else
+  {
+    /* number of batch frames */
+    long nbframe = input->size[0];
+    long i;
+
+    for(i = 0; i < nbframe; i++)
+    {
+      real *gradInputSample_data = gradInput_data + i*niframe*framesize;
+      real *gradOutputSample_data = gradOutput_data + i*noframe*framesize;
+      THIndex_t *indicesSample_data = indices_data + i*noframe*framesize;
+
+      for(t = 0; t < noframe; t++)
+      {
+        real *gip = gradInputSample_data + t*framesize*dW;
+        real *gop = gradOutputSample_data + t*framesize;
+        THIndex_t *xp = indicesSample_data + t*framesize;
+#pragma omp parallel for private(y)
+        for(y = 0; y < framesize; y++)
+        {
+          /* compute local max: */
+          long maxindex = (long)xp[y];
+          if (maxindex != -1)
+            gip[maxindex*framesize+y] += gop[y];
+        }
+      }
+    }
+  }
+
+  /* cleanup */
+  THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/TemporalRowConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/TemporalRowConvolution.c
new file mode 100644
index 000000000..e3ae41e22
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/TemporalRowConvolution.c
@@ -0,0 +1,472 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/TemporalRowConvolution.c"
+#else
+
+static inline void THNN_(TemporalRowConvolution_shapeCheck)(
+  THNNState *state,
+  THTensor *input,
+  THTensor *gradOutput,
+  THTensor *weight,
+  THTensor *bias,
+  int kW,
+  int dW,
+  int padW) {
+
+  THArgCheck(kW > 0, 5,
+             "kernel size should be greater than zero, but got kW: %d", kW);
+  THArgCheck(dW > 0, 6,
+             "stride should be greater than zero, but got dW: %d", dW);
+  THNN_ARGCHECK(weight->nDimension == 3, 3, weight,
+                "3D weight tensor expected, but got: %s");
+  THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous");
+  THArgCheck(!bias || THTensor_(isContiguous)(bias), 5, "bias must be contiguous");
+
+  if (bias != NULL) {
+    THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]);
+  }
+
+  // we're always looking at (possibly batch) x feats x seq
+  int ndim = input->nDimension;
+  int dimF = 0;
+  int dimS = 1;
+
+  if (ndim == 3) {
+    ++dimS;
+    ++dimF;
+  }
+
+  THNN_ARGCHECK(ndim == 2 || ndim == 3, 1, input,
+                "2D or 3D (batch mode) input tensor expected, but got: %s");
+
+  long inputFrameSize = weight->size[0];
+  long nInputFrame = input->size[dimS];
+  long nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1;
+
+  if (nOutputFrame < 1) {
+    THError("Given input
size: (%d x %d). " + "Calculated output size: (%d x %d). Output size is too small", + inputFrameSize, nInputFrame, inputFrameSize, nOutputFrame); + } + + THNN_CHECK_DIM_SIZE(input, ndim, dimF, inputFrameSize); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimF, inputFrameSize); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimS, nOutputFrame); + } +} + +static void THNN_(unfolded_acc_row)( + THTensor *finput, + THTensor *input, + int kW, + int dW, + int padW, + long inputFrameSize, + long nInputFrame, + long nOutputFrame) { + + size_t c; + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); + +// #pragma omp parallel for private(c) + for (c = 0; c < inputFrameSize; c++) { + size_t kw, x; + long long ix = 0; + + for (kw = 0; kw < kW; kw++) { + real *src = finput_data + + c * (kW * nOutputFrame) + + kw * (nOutputFrame); + real *dst = input_data + c * (nInputFrame); + + ix = (long long)(kw); + if (dW == 1) { + real *dst_slice = dst + (size_t)(ix); + THVector_(cadd)(dst_slice, dst_slice, src, 1, nOutputFrame); + } else { + for (x = 0; x < nOutputFrame; x++) { + real *dst_slice = dst + (size_t)(ix + x * dW); + THVector_(cadd)(dst_slice, dst_slice, + src + (size_t)(x), 1, 1); + } + } + } + } +} + +static void THNN_(unfolded_copy_row)( + THTensor *finput, + THTensor *input, + int kW, + int dW, + int padW, + long inputFrameSize, + long nInputFrame, + long nOutputFrame) { + + long k; + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); + +// #pragma omp parallel for private(k) + for (k = 0; k < inputFrameSize * kW; k++) { + size_t c = k / kW; + size_t rest = k % kW; + size_t kw = rest % kW; + size_t x; + long long ix; + real *dst = finput_data + c * (kW * nOutputFrame) + kw * (nOutputFrame); + real *src = input_data + c * (nInputFrame); + + ix = (long long)(kw); + if (dW == 1) { + memcpy(dst, src+(size_t)(ix), sizeof(real) * (nOutputFrame)); + } else { + for (x = 0; x < nOutputFrame; x++) { + memcpy(dst + (size_t)(x), src + (size_t)(ix + x * dW), + sizeof(real) * 1); + } + } + } +} + +static void THNN_(TemporalRowConvolution_updateOutput_frame)( + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + int kW, + int dW, + int padW, + long inputFrameSize, + long nInputFrame, + long nOutputFrame) { + + long i; + + THTensor *output3d = THTensor_(newWithStorage3d)( + output->storage, output->storageOffset, + inputFrameSize, -1, + 1, -1, + nOutputFrame, -1); + + THNN_(unfolded_copy_row)(finput, input, kW, dW, padW, + inputFrameSize, nInputFrame, nOutputFrame); + + THTensor_(zero)(output); + + if (bias != NULL) { + for (i = 0; i < inputFrameSize; i++) + THVector_(fill) + (output->storage->data + output->storageOffset + + output->stride[0] * i, + THTensor_(get1d)(bias, i), nOutputFrame); + } + + THTensor_(baddbmm)(output3d, 1, output3d, 1, weight, finput); + + THTensor_(free)(output3d); +} + +void THNN_(TemporalRowConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + THTensor *fgradInput, // unused here but needed for Cuda + int kW, + int dW, + int padW, + bool featFirst) { + + int ndim = input->nDimension; + + THTensor *tinput; + if (!featFirst) { + tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2); + input = THTensor_(newContiguous)(tinput); + } else { + input = THTensor_(newContiguous)(input); + } + + THNN_(TemporalRowConvolution_shapeCheck)( + state, input, NULL, 
weight, bias, kW, dW, padW); + + long inputFrameSize = weight->size[0]; + long nInputFrame = input->size[ndim - 1]; + long nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; + + if (ndim == 2) { /* non-batch mode */ + + THTensor_(resize3d)(finput, inputFrameSize, kW, nOutputFrame); + THTensor_(resize2d)(output, inputFrameSize, nOutputFrame); + + THTensor_(zero)(finput); + THTensor_(zero)(output); + + THNN_(TemporalRowConvolution_updateOutput_frame) + (input, output, weight, bias, finput, + kW, dW, padW, + inputFrameSize, nInputFrame, nOutputFrame); + + } else { + long T = input->size[0]; + long t; + + THTensor_(resize4d)(finput, T, inputFrameSize, kW, nOutputFrame); + THTensor_(resize3d)(output, T, inputFrameSize, nOutputFrame); + + THTensor_(zero)(finput); + THTensor_(zero)(output); + +#pragma omp parallel for private(t) + for (t = 0; t < T; t++) { + THTensor *input_t = THTensor_(newSelect)(input, 0, t); + THTensor *output_t = THTensor_(newSelect)(output, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + THNN_(TemporalRowConvolution_updateOutput_frame) + (input_t, output_t, weight, bias, finput_t, + kW, dW, padW, inputFrameSize, nInputFrame, nOutputFrame); + + THTensor_(free)(input_t); + THTensor_(free)(output_t); + THTensor_(free)(finput_t); + } + } + + if (!featFirst) { // NOTE: output will NOT be contiguous in this case + THTensor_(transpose)(output, output, ndim - 1, ndim - 2); + THTensor_(free)(tinput); + } + + THTensor_(free)(input); +} + +static void THNN_(TemporalRowConvolution_updateGradInput_frame)( + THTensor *gradInput, + THTensor *gradOutput, + THTensor *weight, + THTensor *fgradInput, + int kW, + int dW, + int padW, + long inputFrameSize, + long nInputFrame, + long nOutputFrame) { + + THTensor *gradOutput3d = THTensor_(newWithStorage3d)( + gradOutput->storage, gradOutput->storageOffset, + inputFrameSize, -1, + 1, -1, + nOutputFrame, -1); + + // weight: inputFrameSize x kW x 1 + // gradOutput3d: inputFrameSize x 1 x nOutputFrame + THTensor_(baddbmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput3d); + // fgradInput: inputFrameSize x kW x nOutputFrame + THTensor_(free)(gradOutput3d); + + THTensor_(zero)(gradInput); + + THNN_(unfolded_acc_row)(fgradInput, gradInput, + kW, dW, padW, + inputFrameSize, nInputFrame, nOutputFrame); +} + +void THNN_(TemporalRowConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kW, + int dW, + int padW, + bool featFirst) { + + int ndim = input->nDimension; + + THTensor *tinput, *tgradOutput; + + if (!featFirst) { + tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2); + tgradOutput = THTensor_(newTranspose)(gradOutput, ndim - 1, ndim - 2); + + input = THTensor_(newContiguous)(tinput); + gradOutput = THTensor_(newContiguous)(tgradOutput); + + } else { + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + } + + THNN_(TemporalRowConvolution_shapeCheck)(state, input, gradOutput, weight, + NULL, kW, dW, padW); + + long inputFrameSize = weight->size[0]; + long nInputFrame = input->size[ndim - 1]; + long nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; + + THTensor_(resizeAs)(fgradInput, finput); + THTensor_(resizeAs)(gradInput, input); + + THTensor_(zero)(fgradInput); + THTensor_(zero)(gradInput); + + THTensor *tweight = THTensor_(new)(); + THTensor_(transpose)(tweight, weight, 1, 2); + + if (ndim == 2) { + 
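+    /* non-batch mode: a single (inputFrameSize x nInputFrame) sample */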
THNN_(TemporalRowConvolution_updateGradInput_frame) + (gradInput, gradOutput, tweight, fgradInput, + kW, dW, padW, + inputFrameSize, nInputFrame, nOutputFrame); + } else { + long T = input->size[0]; + long t; + +#pragma omp parallel for private(t) + for (t = 0; t < T; t++) { + + THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t); + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t); + + THNN_(TemporalRowConvolution_updateGradInput_frame) + (gradInput_t, gradOutput_t, tweight, fgradInput_t, + kW, dW, padW, + inputFrameSize, nInputFrame, nOutputFrame); + + THTensor_(free)(gradInput_t); + THTensor_(free)(gradOutput_t); + THTensor_(free)(fgradInput_t); + } + } + + THTensor_(free)(tweight); + + if (!featFirst) { // NOTE: gradInput will NOT be contiguous in this case + + THTensor_(free)(tinput); + THTensor_(free)(tgradOutput); + + THTensor_(transpose)(gradInput, gradInput, ndim - 1, ndim - 2); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + +} + +static void THNN_(TemporalRowConvolution_accGradParameters_frame)( + THTensor *gradOutput, THTensor *gradWeight, THTensor *gradBias, + THTensor *finput, real scale) { + + long i; + THTensor *gradOutput3d = THTensor_(newWithStorage3d)( + gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + 1, -1, + gradOutput->size[1], -1); + + THTensor *tfinput = THTensor_(new)(); + THTensor_(transpose)(tfinput, finput, 1, 2); + // gradOutput3d: inputFrameSize x 1 x nOutputFrame + // finput: inputFrameSize x nOutputFrame x kW + THTensor_(baddbmm)(gradWeight, 1, gradWeight, scale, gradOutput3d, tfinput); + // gradWeight: inputFrameSize x 1 x kW + THTensor_(free)(tfinput); + + if (gradBias != NULL) { + for (i = 0; i < gradBias->size[0]; i++) { + long k; + real sum = 0; + real *data = gradOutput3d->storage->data + + gradOutput3d->storageOffset + + i * gradOutput3d->stride[0]; + for (k = 0; k < gradOutput3d->size[2]; k++) { + sum += data[k]; + } + (gradBias->storage->data + gradBias->storageOffset)[i] + += scale * sum; + } + } + + THTensor_(free)(gradOutput3d); + +} + +void THNN_(TemporalRowConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int kW, + int dW, + int padW, + bool featFirst, + accreal scale_) { + + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + int ndim = input->nDimension; + + THTensor *tinput, *tgradOutput; + + if (!featFirst) { + tinput = THTensor_(newTranspose)(input, ndim - 1, ndim - 2); + tgradOutput = THTensor_(newTranspose)(gradOutput, ndim - 1, ndim - 2); + + input = THTensor_(newContiguous)(tinput); + gradOutput = THTensor_(newContiguous)(tgradOutput); + } else { + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + } + + THNN_(TemporalRowConvolution_shapeCheck) + (state, input, gradOutput, gradWeight, gradBias, kW, dW, padW); + + long inputFrameSize = gradWeight->size[0]; + long nInputFrame = input->size[ndim - 1]; + long nOutputFrame = (nInputFrame + 2 * padW - kW) / dW + 1; + + if (ndim == 2) { + THNN_(TemporalRowConvolution_accGradParameters_frame)( + gradOutput, gradWeight, gradBias, finput, scale); + } else { + long T = input->size[0]; + long t; + + for (t = 0; t < T; t++) { + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + 
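/* Every timestep accumulates into the same gradWeight/gradBias
+         tensors, so this loop stays serial: an OpenMP pragma here (as
+         used in updateOutput above) would race on the shared
+         accumulators. */
+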
THNN_(TemporalRowConvolution_accGradParameters_frame)( + gradOutput_t, gradWeight, gradBias, finput_t, scale); + + THTensor_(free)(gradOutput_t); + THTensor_(free)(finput_t); + } + } + + if (!featFirst) { + THTensor_(free)(tinput); + THTensor_(free)(tgradOutput); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/TemporalSubSampling.c b/contrib/lua-torch/nn/lib/THNN/generic/TemporalSubSampling.c new file mode 100644 index 000000000..68f35e28a --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/TemporalSubSampling.c @@ -0,0 +1,156 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/TemporalSubSampling.c" +#else + +static inline void THNN_(TemporalSubSampling_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + int kW, + int dW, + int *inputFrameSize) { + int nInputFrame, nOutputFrame; + + THArgCheck(kW > 0, 6, + "kernel size should be greater than zero, but got kW: %d", kW); + THArgCheck(dW > 0, 7, + "stride should be greater than zero, but got dW: %d", dW); + + THNN_ARGCHECK(input->nDimension == 2, 2, input, + "2D or 3D (batch mode) tensor expected for input, but got: %s"); + if (inputFrameSize != NULL) { + THArgCheck( input->size[1] == *inputFrameSize, 2, + "invalid input frame size. Got: %d, Expected: %d", + input->size[1], *inputFrameSize); + } + THArgCheck( input->size[0] >= kW, 2, + "input sequence smaller than kernel size. Got %d, Expected: %d", + input->size[0], kW); + + nInputFrame = input->size[0]; + nOutputFrame = (nInputFrame - kW) / dW + 1; + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, input->nDimension, 0, nOutputFrame); + if (inputFrameSize != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, input->nDimension, 1, *inputFrameSize); + } + } +} + +void THNN_(TemporalSubSampling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + int kW, + int dW, + int inputFrameSize) +{ + THTensor *outputFrame, *inputWindow; + int nInputFrame, nOutputFrame; + long k; + + THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous"); + THArgCheck(!bias || THTensor_(isContiguous)(bias), 4, "bias must be contiguous"); + THNN_(TemporalSubSampling_shapeCheck)(state, input, NULL, kW, dW, &inputFrameSize); + + outputFrame = THTensor_(new)(); + inputWindow = THTensor_(new)(); + + nInputFrame = input->size[0]; + nOutputFrame = (nInputFrame - kW) / dW + 1; + + THTensor_(resize2d)(output, + nOutputFrame, + inputFrameSize); + + for(k = 0; k < nOutputFrame; k++) + { + THTensor_(narrow)(inputWindow, input, 0, k*dW, kW); + THTensor_(select)(outputFrame, output, 0, k); + THTensor_(sum)(outputFrame, inputWindow, 0, 1); + THTensor_(cmul)(outputFrame, outputFrame, weight); + THTensor_(cadd)(outputFrame, outputFrame, 1, bias); + } + + THTensor_(free)(outputFrame); + THTensor_(free)(inputWindow); +} + +void THNN_(TemporalSubSampling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + int kW, + int dW) +{ + + THTensor *gradOutputFrame; + THTensor *gradInputWindow, *buffer, *kwunit; + long k; + + THArgCheck(THTensor_(isContiguous)(weight), 4, "weight must be contiguous"); + THNN_(TemporalSubSampling_shapeCheck)(state, input, gradOutput, kW, dW, NULL); + + gradOutputFrame = THTensor_(new)(); + gradInputWindow = THTensor_(new)(); + buffer = THTensor_(new)(); + kwunit = THTensor_(newWithSize1d)(kW); + + THTensor_(fill)(kwunit, 1); + 
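/* kwunit is a kW-vector of ones: with THTensor_(addr) below, each
+     weighted gradOutput frame (buffer = weight .* gradOutputFrame) is
+     broadcast as the outer product kwunit (x) buffer onto all kW input
+     frames of its window -- the exact adjoint of the forward sum. */
+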
THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + for(k = 0; k < gradOutput->size[0]; k++) + { + THTensor_(narrow)(gradInputWindow, gradInput, 0, k*dW, kW); + THTensor_(select)(gradOutputFrame, gradOutput, 0, k); + THTensor_(cmul)(buffer, weight, gradOutputFrame); + THTensor_(addr)(gradInputWindow, 1, gradInputWindow, 1, kwunit, buffer); + } + + THTensor_(free)(gradOutputFrame); + THTensor_(free)(gradInputWindow); + THTensor_(free)(buffer); + THTensor_(free)(kwunit); +} + +void THNN_(TemporalSubSampling_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + int kW, + int dW, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + THTensor *gradOutputFrame; + THTensor *inputWindow, *buffer; + long k; + + THNN_(TemporalSubSampling_shapeCheck)(state, input, gradOutput, kW, dW, NULL); + gradOutputFrame = THTensor_(new)(); + inputWindow = THTensor_(new)(); + buffer = THTensor_(new)(); + + for(k = 0; k < gradOutput->size[0]; k++) + { + THTensor_(narrow)(inputWindow, input, 0, k*dW, kW); + THTensor_(select)(gradOutputFrame, gradOutput, 0, k); + THTensor_(sum)(buffer, inputWindow, 0, 1); + THTensor_(addcmul)(gradWeight, gradWeight, scale, buffer, gradOutputFrame); + THTensor_(cadd)(gradBias, gradBias, scale, gradOutputFrame); + } + + THTensor_(free)(gradOutputFrame); + THTensor_(free)(inputWindow); + THTensor_(free)(buffer); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/Threshold.c b/contrib/lua-torch/nn/lib/THNN/generic/Threshold.c new file mode 100644 index 000000000..949c7a07c --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/Threshold.c @@ -0,0 +1,64 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/Threshold.c" +#else + +void THNN_(Threshold_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + accreal threshold_, + accreal val_, + bool inplace) +{ + real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_); + real val = TH_CONVERT_ACCREAL_TO_REAL(val_); + if (inplace) + { + TH_TENSOR_APPLY(real, input, + if (*input_data <= threshold) + *input_data = val; + ); + THTensor_(set)(output, input); + } + else + { + THTensor_(resizeAs)(output, input); + TH_TENSOR_APPLY2(real, output, real, input, + *output_data = (*input_data > threshold) ? 
*input_data : val; + ); + } +} + +void THNN_(Threshold_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + accreal threshold_, + accreal val_, + bool inplace) +{ + real threshold = TH_CONVERT_ACCREAL_TO_REAL(threshold_); + real val = TH_CONVERT_ACCREAL_TO_REAL(val_); + THNN_CHECK_NELEMENT(input, gradOutput); + if (inplace) + { + TH_TENSOR_APPLY2(real, gradOutput, real, input, + if ((*input_data) <= threshold) + *gradOutput_data = 0; + ); + THTensor_(set)(gradInput, gradOutput); + } + else + { + THTensor_(resizeAs)(gradInput, input); + TH_TENSOR_APPLY3(real, gradInput, real, gradOutput, real, input, + if ((*input_data) > threshold) + *gradInput_data = *gradOutput_data; + else + *gradInput_data = 0; + ); + } +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricAveragePooling.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricAveragePooling.c new file mode 100644 index 000000000..91c870e6f --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricAveragePooling.c @@ -0,0 +1,373 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricAveragePooling.c" +#else + +static inline void THNN_(VolumetricAveragePooling_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH) { + long nslices; + long itime; + long iheight; + long iwidth; + long otime; + long oheight; + long owidth; + int ndim = input->nDimension; + int dimN = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + if (input->nDimension == 5) + { + dimN++; + dimt++; + dimh++; + dimw++; + } + + THArgCheck(kT > 0 && kW > 0 && kH > 0, 5, + "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", + kT, kH, kW); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 8, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", + dT, dH, dW); + THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input, + "4D or 5D (batch mode) tensor expected for input, but got: %s"); + + THArgCheck(input->size[dimw] >= kW && input->size[dimh] >= kH + && input->size[dimt] >= kT, 2, + "input image (T: %d H: %d W: %d) smaller than " + "kernel size (kT: %d kH: %d kW: %d)", + input->size[dimt], input->size[dimh], input->size[dimw], + kT, kH, kW); + + /* sizes */ + nslices = input->size[dimN]; + itime = input->size[dimt]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + otime = (itime - kT) / dT + 1; + oheight = (iheight - kH) / dH + 1; + owidth = (iwidth - kW) / dW + 1; + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimN, nslices); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, otime); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, oheight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, owidth); + } +} + +static void THNN_(VolumetricAveragePooling_updateOutput_frame)( + real *input_p, + real *output_p, + long nslices, + long itime, + long iwidth, + long iheight, + long otime, + long owidth, + long oheight, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH) +{ + long k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + /* loop over output */ + long i, j, ti; + for (ti = 0; ti < otime; ti++) + { + for (i = 0; i < oheight; i++) + { + for (j = 0; j < owidth; j++) + { + /* local pointers */ + real *ip = input_p + k * itime * iwidth * iheight + + ti * iwidth * iheight * dT + i * iwidth * dH + j * dW; + real *op = output_p + k * otime * owidth * oheight + + ti * owidth * oheight + i * owidth + j; + 
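+          /* ip addresses the front-top-left corner of the pooling
+             window for output (ti,i,j): plane k plus the window origin
+             (ti*dT, i*dH, j*dW) flattened in t-major order; e.g.
+             (illustrative) itime = iheight = iwidth = 4 and
+             dT = dH = dW = 2 put window (1,0,1) at in-plane offset
+             2*16 + 0*4 + 1*2 = 34. */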
+ /* compute local sum: */ + real sum = 0.0; + int x, y, z; + + for (z=0; z < kT; z++) + { + for (y = 0; y < kH; y++) + { + for (x = 0; x < kW; x++) + { + sum += *(ip + z * iwidth * iheight + y * iwidth + x); + } + } + } + + /* set output to local max */ + *op = sum / (kT * kW * kH); + } + } + } + } +} + +void THNN_(VolumetricAveragePooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH) +{ + long nslices; + long itime; + long iheight; + long iwidth; + long otime; + long oheight; + long owidth; + real *input_data; + real *output_data; + + THNN_(VolumetricAveragePooling_shapeCheck)( + state, input, NULL, kT, kW, kH, + dT, dW, dH); + + int dimN = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + if (input->nDimension == 5) + { + dimN++; + dimt++; + dimh++; + dimw++; + } + + /* sizes */ + nslices = input->size[dimN]; + itime = input->size[dimt]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + otime = (itime - kT) / dT + 1; + oheight = (iheight - kH) / dH + 1; + owidth = (iwidth - kW) / dW + 1; + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + if (input->nDimension == 4) /* non-batch mode */ + { + /* resize output */ + THTensor_(resize4d)(output, nslices, otime, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + + THNN_(VolumetricAveragePooling_updateOutput_frame)( + input_data, output_data, nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + kT, kW, kH, + dT, dW, dH + ); + } + else /* batch mode */ + { + long p; + long nBatch = input->size[0]; + + long istride = nslices * itime * iwidth * iheight; + long ostride = nslices * otime * owidth * oheight; + + /* resize output */ + THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(p) + for (p=0; p < nBatch; p++) + { + THNN_(VolumetricAveragePooling_updateOutput_frame)( + input_data + p * istride, output_data + p * ostride, nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + kT, kW, kH, + dT, dW, dH + ); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(VolumetricAveragePooling_updateGradInput_frame)( + real *gradInput_p, + real *gradOutput_p, + long nslices, + long itime, + long iwidth, + long iheight, + long otime, + long owidth, + long oheight, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH) +{ + long k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + /* loop over output */ + long i, j, ti; + for (ti = 0; ti < otime; ti++) + { + for (i = 0; i < oheight; i++) + { + for (j = 0; j < owidth; j++) + { + /* local pointers */ + real *ip = gradInput_p + k * itime * iwidth * iheight + + ti * iwidth * iheight * dT + i * iwidth * dH + j * dW; + real *op = gradOutput_p + k * otime * owidth * oheight + + ti * owidth * oheight + i * owidth + j; + + /* scatter gradients out to footprint: */ + real val = *op / (kT * kW * kH); + int x,y,z; + for (z=0; z < kT; z++) + { + for (y = 0; y < kH; y++) + { + for (x = 0; x < kW; x++) + { + *(ip + z * iwidth * iheight + y * iwidth + x) += val; + } + } + } + } + } + } + } +} + +void THNN_(VolumetricAveragePooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH) +{ + int nslices; + int itime; + int iheight; + 
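/* Backward pass: updateGradInput_frame above divides each
+     gradOutput value by the window volume kT*kW*kH and accumulates it
+     over that window's input footprint; when stride < kernel the
+     footprints overlap and the contributions sum. */
+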
int iwidth;
+  int otime;
+  int oheight;
+  int owidth;
+  real *gradInput_data;
+  real *gradOutput_data;
+
+  int dimN = 0;
+  int dimt = 1;
+  int dimh = 2;
+  int dimw = 3;
+
+  THNN_(VolumetricAveragePooling_shapeCheck)(
+        state, input, gradOutput, kT, kW, kH,
+        dT, dW, dH);
+
+  /* get contiguous gradOutput */
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+
+  /* resize */
+  THTensor_(resizeAs)(gradInput, input);
+  THTensor_(zero)(gradInput);
+
+  if (input->nDimension == 5)
+  {
+    dimN++;
+    dimt++;
+    dimh++;
+    dimw++;
+  }
+
+  /* sizes */
+  nslices = input->size[dimN];
+  itime = input->size[dimt];
+  iheight = input->size[dimh];
+  iwidth = input->size[dimw];
+  otime = gradOutput->size[dimt];
+  oheight = gradOutput->size[dimh];
+  owidth = gradOutput->size[dimw];
+
+  /* get raw pointers */
+  gradInput_data = THTensor_(data)(gradInput);
+  gradOutput_data = THTensor_(data)(gradOutput);
+
+  /* backprop */
+  if (input->nDimension == 4) /* non-batch mode */
+  {
+    THNN_(VolumetricAveragePooling_updateGradInput_frame)(
+      gradInput_data, gradOutput_data, nslices,
+      itime, iwidth, iheight,
+      otime, owidth, oheight,
+      kT, kW, kH,
+      dT, dW, dH
+    );
+  }
+  else /* batch mode */
+  {
+    long p;
+    long nBatch = input->size[0];
+
+    long istride = nslices * itime * iwidth * iheight;
+    long ostride = nslices * otime * owidth * oheight;
+
+#pragma omp parallel for private(p)
+    for (p = 0; p < nBatch; p++)
+    {
+      THNN_(VolumetricAveragePooling_updateGradInput_frame)(
+        gradInput_data + p * istride, gradOutput_data + p * ostride, nslices,
+        itime, iwidth, iheight,
+        otime, owidth, oheight,
+        kT, kW, kH,
+        dT, dW, dH
+      );
+    }
+  }
+
+  /* cleanup */
+  THTensor_(free)(gradOutput);
+}
+
+#endif
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolution.c
new file mode 100644
index 000000000..be1aa82e6
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolution.c
@@ -0,0 +1,260 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricConvolution.c"
+#else
+
+void THNN_(VolumetricConvolution_updateOutput)(
+  THNNState *state,
+  THTensor *input,
+  THTensor *output,
+  THTensor *weight,
+  THTensor *bias,
+  THTensor *finput,     // only used by cuda impl
+  THTensor *fgradInput, // only used by cuda impl
+  int dT,
+  int dW,
+  int dH,
+  int pT,
+  int pW,
+  int pH)
+{
+  THArgCheck(pT == 0 && pW == 0 && pH == 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version
+
+  THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input,
+                "4D or 5D (batch mode) tensor expected for input, but got: %s");
+
+  int dimt = 1;
+  int dimh = 2;
+  int dimw = 3;
+
+  if (input->nDimension == 5)
+  {
+    dimt++;
+    dimh++;
+    dimw++;
+  }
+
+  long nOutputPlane = weight->size[0];
+  long kT           = weight->size[2];
+  long kH           = weight->size[3];
+  long kW           = weight->size[4];
+  long inputDepth   = input->size[dimt];
+  long inputHeight  = input->size[dimh];
+  long inputWidth   = input->size[dimw];
+  long outputDepth  = (inputDepth - kT) / dT + 1;
+  long outputWidth  = (inputWidth - kW) / dW + 1;
+  long outputHeight = (inputHeight - kH) / dH + 1;
+  THTensor *outn = THTensor_(new)();
+  long i, j;
+  if (input->nDimension == 4) /* non-batch mode */
+  {
+    THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth);
+
+    /* add bias */
+    if (bias) {
+      for (i = 0; i < bias->size[0]; i++)
+      {
+        THTensor_(select)(outn, output, 0, i);
+        THTensor_(fill)(outn, THTensor_(get1d)(bias, i));
+      }
+    } else {
+      THTensor_(zero)(output);
+    }
+
+    /* do convolutions */
+    THTensor_(conv3Dmv)(output, 1.0, 1.0, input, weight, dT, dH, dW, "V", "X");
+  }
+  else /* batch mode */
+  {
+    long nBatch = input->size[0];
+    THTensor_(resize5d)(output, nBatch, nOutputPlane, outputDepth, outputHeight, outputWidth);
+    THTensor *inb = THTensor_(new)();
+    THTensor *outb = THTensor_(new)();
+
+    /* loop over batches */
+    for (j = 0; j < nBatch; j++)
+    {
+      THTensor_(select)(inb, input, 0, j);
+      THTensor_(select)(outb, output, 0, j);
+
+      /* add bias */
+      if (bias) {
+        for (i = 0; i < bias->size[0]; i++)
+        {
+          THTensor_(select)(outn, outb, 0, i);
+          THTensor_(fill)(outn, THTensor_(get1d)(bias, i));
+        }
+      } else {
+        THTensor_(zero)(outb);
+      }
+
+      /* do convolutions */
+      THTensor_(conv3Dmv)(outb, 1.0, 1.0, inb, weight, dT, dH, dW, "V", "X");
+    }
+
+    THTensor_(free)(inb);
+    THTensor_(free)(outb);
+  }
+  THTensor_(free)(outn);
+}
+
+void THNN_(VolumetricConvolution_updateGradInput)(
+  THNNState *state,
+  THTensor *input,
+  THTensor *gradOutput,
+  THTensor *gradInput,
+  THTensor *weight,
+  THTensor *finput, // only used by cuda impl
+  int dT,
+  int dW,
+  int dH,
+  int pT,
+  int pW,
+  int pH)
+{
+  THArgCheck(pT == 0 && pW == 0 && pH == 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version
+
+  THNN_ARGCHECK(weight->nDimension == 5, 4, weight,
+                "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
+                "expected for weight, but got: %s");
+
+  int nOutputPlane = (int)weight->size[0];
+
+  THNN_ARGCHECK(gradOutput->nDimension == 4 || gradOutput->nDimension == 5, 3,
+                gradOutput,
+                "4D or 5D (batch mode) tensor expected for gradOutput, but got: %s");
+
+  int dimPlane = 0;
+  if (gradOutput->nDimension == 5)
+  {
+    dimPlane++;
+  }
+
+  THArgCheck(nOutputPlane == gradOutput->size[dimPlane], 1,
+    "Number of output features is not equal to nOutputPlane"
+  );
+
+  /* gradient to input */
+  THTensor *tweight = THTensor_(newTranspose)(weight, 0, 1);
+  if (gradOutput->nDimension == 4) /* non-batch mode */
+  {
+    THTensor_(conv3Dmv)(gradInput, 0.0, 1.0, gradOutput, tweight, dT, dH, dW, "F", "C");
+  }
+  else /* batch mode */
+  {
+    long nBatch = gradOutput->size[0];
+    THTensor *ginpb = THTensor_(new)();
+    THTensor *goutb = THTensor_(new)();
+    long j;
+
+    THTensor_(resize5d)(gradInput,
+      input->size[0], input->size[1], input->size[2], input->size[3], input->size[4]
+    );
+
+    /* loop over batches */
+    for (j = 0; j < nBatch; j++)
+    {
+      THTensor_(select)(ginpb, gradInput, 0, j);
+      THTensor_(select)(goutb, gradOutput, 0, j);
+      THTensor_(conv3Dmv)(ginpb, 0.0, 1.0, goutb, tweight, dT, dH, dW, "F", "C");
+    }
+    THTensor_(free)(ginpb);
+    THTensor_(free)(goutb);
+  }
+
+  THTensor_(free)(tweight);
+}
+
+void THNN_(VolumetricConvolution_accGradParameters)(
+  THNNState *state,
+  THTensor *input,
+  THTensor *gradOutput,
+  THTensor *gradWeight,
+  THTensor *gradBias,
+  THTensor *finput,     // only used by cuda impl
+  THTensor *fgradInput, // only used by cuda impl
+  int dT,
+  int dW,
+  int dH,
+  int pT,
+  int pW,
+  int pH,
+  accreal scale_)
+{
+  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
+  THArgCheck(pT == 0 && pW == 0 && pH == 0, 9, "padding not supported by CPU backend"); // sharing signature with CUDA version
+
+  THNN_ARGCHECK(gradWeight->nDimension == 5, 4, gradWeight,
+                "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor "
+                "expected for gradWeight, but got: %s");
+
+  int nOutputPlane = (int)gradWeight->size[0];
+  if (gradBias) {
+    THArgCheck(gradBias->nDimension == 1 && gradBias->size[0] == nOutputPlane, 5,
+      "gradBias tensor
has wrong size" + ); + } + + long k; + real *gradBias_data; + THTensor *gradOutSlice; + int dimPlane = 0; + if (gradOutput->nDimension == 5) + { + dimPlane++; + } + + THArgCheck(nOutputPlane == gradOutput->size[dimPlane], 1, + "Number of output features is not equal to nOutputPlane" + ); + + if (gradOutput->nDimension == 4) /* non-batch mode */ + { + /* gradient to bias */ + if (gradBias) { + gradBias_data = THTensor_(data)(gradBias); + gradOutSlice = THTensor_(new)(); + for (k = 0; k < nOutputPlane; k++) + { + THTensor_(select)(gradOutSlice, gradOutput, 0, k); + gradBias_data[k] += scale * THTensor_(sumall)(gradOutSlice); + } + THTensor_(free)(gradOutSlice); + } + + /* gradient to kernels */ + THTensor_(conv3DRevger)(gradWeight, 1.0, scale, input, gradOutput, dT, dH, dW); + } + else /* batch mode */ + { + long nBatch = gradOutput->size[0]; + THTensor *inpb = THTensor_(new)(); + THTensor *goutb = THTensor_(new)(); + long j; + + /* loop over batches */ + for (j = 0; j < nBatch; j++) + { + THTensor_(select)(inpb, input, 0, j); + THTensor_(select)(goutb, gradOutput, 0, j); + + /* gradient to bias */ + if (gradBias) { + gradBias_data = THTensor_(data)(gradBias); + gradOutSlice = THTensor_(new)(); + for (k = 0; k < nOutputPlane; k++) + { + THTensor_(select)(gradOutSlice, goutb, 0, k); + gradBias_data[k] += scale * THTensor_(sumall)(gradOutSlice); + } + THTensor_(free)(gradOutSlice); + } + + /* gradient to kernels */ + THTensor_(conv3DRevger)(gradWeight, 1.0, scale, inpb, goutb, dT, dH, dW); + } + THTensor_(free)(inpb); + THTensor_(free)(goutb); + } +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolutionMM.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolutionMM.c new file mode 100644 index 000000000..00a121db6 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricConvolutionMM.c @@ -0,0 +1,628 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricConvolutionMM.c" +#else + +static void inline THNN_(VolumetricConvolutionMM_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *weight, + THTensor *bias, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) { + THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input, + "4D or 5D (batch mode) tensor expected for input, but got: %s"); + THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, + "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", kT, kH, kW); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW); + + int ndim = input->nDimension; + int dimf = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + if (ndim == 5) + { + dimf++; + dimt++; + dimh++; + dimw++; + } + + long nInputPlane; + long inputDepth; + long inputHeight; + long inputWidth; + long nOutputPlane; + long outputDepth; + long outputHeight; + long outputWidth; + + nInputPlane = input->size[dimf]; + inputDepth = input->size[dimt]; + inputHeight = input->size[dimh]; + inputWidth = input->size[dimw]; + nOutputPlane = weight->size[0]; + outputDepth = (inputDepth + 2*pT - kT) / dT + 1; + outputHeight = (inputHeight + 2*pH - kH) / dH + 1; + outputWidth = (inputWidth + 2*pW - kW) / dW + 1; + + if (outputWidth < 1 || outputHeight < 1 || outputDepth < 1) + { + THError( + "Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). 
Output size is too small", + nInputPlane, inputDepth, inputHeight, inputWidth, + nOutputPlane, outputDepth, outputHeight, outputWidth + ); + } + + THArgCheck(weight->nDimension == 2 || weight->nDimension == 5, 4, + "weight tensor should be 2D or 5D - got %d", weight->nDimension); + + if (bias != NULL) { + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]); + } + + THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, outputDepth); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } +} + +static int THNN_(view_weight)(THTensor **_weight) +{ + THTensor *weight = *_weight; + if (weight->nDimension == 5) { + long s1 = weight->size[0]; + long s2 = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + *_weight = THTensor_(newWithStorage2d)(weight->storage, weight->storageOffset, s1, -1, s2, -1); + return 1; + } + return 0; +} + +/* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */ +static void THNN_(unfolded_acc_vol)( + THTensor *finput, + THTensor *input, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + int nInputPlane, + int inputDepth, + int inputWidth, + int inputHeight, + int outputDepth, + int outputWidth, + int outputHeight) +{ + int nip; + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); + +//#pragma omp parallel for private(nip) + for (nip = 0; nip < nInputPlane; nip++) + { + int kt, kw, kh, t, y, x, it, ix, iy; + for (kt = 0; kt < kT; kt++) + { + for (kh = 0; kh < kH; kh++) + { + for (kw = 0; kw < kW; kw++) + { + real *src = finput_data + + nip * (kT*kH*kW*outputDepth*outputHeight*outputWidth) + + kt * (kH*kW*outputDepth*outputHeight*outputWidth) + + kh * (kW*outputDepth*outputHeight*outputWidth) + + kw * (outputDepth*outputHeight*outputWidth); + + real *dst = input_data + nip*(inputDepth*inputHeight*inputWidth); + if (pT > 0 || pH > 0 || pW > 0) + { + for (t = 0; t < outputDepth; t++) + { + it = t*dT - pT + kt; + for (y = 0; y < outputHeight; y++) + { + iy = y*dH - pH + kh; + for (x = 0; x < outputWidth; x++) + { + ix = x*dW - pW + kw; + if (it < 0 || it >= inputDepth || iy < 0 || iy >= inputHeight || ix < 0 || ix >= inputWidth) + { + } + else + { + real *dst_slice = dst+it*inputHeight*inputWidth+iy*inputWidth+ix; + THVector_(cadd)(dst_slice, dst_slice, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1); + } + } + } + } + } + else + { + for (t = 0; t < outputDepth; t++) + { + it = t*dT + kt; + for (y = 0; y < outputHeight; y++) + { + iy = y*dH + kh; + for(x = 0; x < outputWidth; x++) + { + ix = x*dW + kw; + real *dst_slice = dst+it*inputHeight*inputWidth+iy*inputWidth+ix; + THVector_(cadd)(dst_slice, dst_slice, src+t*outputHeight*outputWidth+y*outputWidth+x, 1, 1); + } + } + } + } + } + } + } + } +} + +static void THNN_(unfolded_copy_vol)( + THTensor *finput, + THTensor *input, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + int nInputPlane, + int inputDepth, + int inputWidth, + int inputHeight, + int outputDepth, + int outputWidth, + int outputHeight) +{ + long k; + real *input_data = THTensor_(data)(input); + real *finput_data = THTensor_(data)(finput); +// #pragma omp parallel for private(k) + for (k = 0; k < nInputPlane*kT*kH*kW; k++) + { + int nip = k / (kT*kH*kW); + int rest = k % (kT*kH*kW); 
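+    /* decode the flat index k into (nip, kt, kh, kw); e.g.
+       (illustrative) kT = kH = kW = 2 and k = 11 give nip = 1 and
+       rest = 3, hence kt = 0, kh = 1, kw = 1 below */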
+ int kt = rest / (kH*kW); + rest = rest % (kH*kW); + int kh = rest / kW; + int kw = rest % kW; + int t,x,y,it,ix,iy; + real *dst = finput_data + + nip * (kT*kH*kW*outputDepth*outputHeight*outputWidth) + + kt * (kH*kW*outputDepth*outputHeight*outputWidth) + + kh * (kW*outputDepth*outputHeight*outputWidth) + + kw * (outputDepth*outputHeight*outputWidth); + real *src = input_data + nip*(inputDepth*inputHeight*inputWidth); + + if (pT > 0 || pH > 0 || pW > 0) + { + for (t = 0; t < outputDepth; t++) + { + it = t*dT - pT + kt; + for (y = 0; y < outputHeight; y++) + { + iy = y*dH - pH + kh; + for (x = 0; x < outputWidth; x++) + { + ix = x*dW - pW + kw; + if (it < 0 || it >= inputDepth || iy < 0 || iy >= inputHeight || ix < 0 || ix >= inputWidth) + memset(dst+t*outputHeight*outputWidth+y*outputWidth+x, 0, sizeof(real)*(1)); + else + memcpy(dst+t*outputHeight*outputWidth+y*outputWidth+x, src+it*inputHeight*inputWidth+iy*inputWidth+ix, sizeof(real)*(1)); + } + } + } + } + else + { + for (t = 0; t < outputDepth; t++) + { + it = t*dT + kt; + for (y = 0; y < outputHeight; y++) + { + iy = y*dH + kh; + for(x = 0; x < outputWidth; x++) + { + ix = x*dW + kw; + memcpy(dst+t*outputHeight*outputWidth+y*outputWidth+x, src+it*inputHeight*inputWidth+iy*inputWidth+ix, sizeof(real)*(1)); + } + } + } + } + } +} + +static void THNN_(VolumetricConvolutionMM_updateOutput_frame)( + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + long nInputPlane, + long inputDepth, + long inputWidth, + long inputHeight, + long nOutputPlane, + long outputDepth, + long outputWidth, + long outputHeight) +{ + long i; + THTensor *output2d; + + THNN_(unfolded_copy_vol)( + finput, input, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH, + nInputPlane, + inputDepth, inputWidth, inputHeight, + outputDepth, outputWidth, outputHeight + ); + + output2d = THTensor_(newWithStorage2d)( + output->storage, output->storageOffset, nOutputPlane, -1, + outputDepth*outputHeight*outputWidth, -1 + ); + + if (bias) { + for (i = 0; i < nOutputPlane; i++) + { + THVector_(fill)( + output->storage->data+output->storageOffset+output->stride[0]*i, + THTensor_(get1d)(bias, i), + outputDepth*outputHeight*outputWidth + ); + } + } else { + THTensor_(zero)(output); + } + + THTensor_(addmm)(output2d, 1, output2d, 1, weight, finput); + + THTensor_(free)(output2d); +} + +void THNN_(VolumetricConvolutionMM_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *finput, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) +{ + int dimf = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + int freeWeight = 0; + + long nInputPlane; + long inputDepth; + long inputHeight; + long inputWidth; + long nOutputPlane; + long outputDepth; + long outputHeight; + long outputWidth; + + THNN_(VolumetricConvolutionMM_shapeCheck)( + state, input, NULL, weight, bias, + kT, kW, kH, dT, dW, dH, pT, pW, pH); + input = THTensor_(newContiguous)(input); + + if (input->nDimension == 5) + { + dimf++; + dimt++; + dimh++; + dimw++; + } + + nInputPlane = input->size[dimf]; + inputDepth = input->size[dimt]; + inputHeight = input->size[dimh]; + inputWidth = input->size[dimw]; + nOutputPlane = weight->size[0]; + outputDepth = (inputDepth + 2*pT - kT) / dT + 1; + outputHeight = (inputHeight + 2*pH - kH) / dH + 1; + outputWidth = (inputWidth + 2*pW - kW) / dW + 1; + + 
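/* e.g. (illustrative): inputDepth = 16, kT = 3, pT = 1, dT = 1 gives
+     outputDepth = (16 + 2 - 3)/1 + 1 = 16. view_weight below flattens a
+     5D weight to 2D (nOutputPlane x nInputPlane*kT*kH*kW) so each frame
+     reduces to a single addmm over the unfolded input. */
+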
freeWeight = THNN_(view_weight)(&weight); + + if (input->nDimension == 4) + { + THTensor_(resize2d)(finput, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth); + THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth); + + THNN_(VolumetricConvolutionMM_updateOutput_frame)( + input, output, weight, bias, finput, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH, + nInputPlane, inputDepth, inputWidth, inputHeight, + nOutputPlane, outputDepth, outputWidth, outputHeight + ); + } + else + { + long T = input->size[0]; + long t; + + THTensor_(resize3d)(finput, T, kT*kW*kH*nInputPlane, outputDepth*outputHeight*outputWidth); + THTensor_(resize5d)(output, T, nOutputPlane, outputDepth, outputHeight, outputWidth); + +// #pragma omp parallel for private(t) + for (t = 0; t < T; t++) + { + THTensor *input_t = THTensor_(newSelect)(input, 0, t); + THTensor *output_t = THTensor_(newSelect)(output, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + THNN_(VolumetricConvolutionMM_updateOutput_frame)( + input_t, output_t, weight, bias, finput_t, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH, + nInputPlane, inputDepth, inputWidth, inputHeight, + nOutputPlane, outputDepth, outputWidth, outputHeight + ); + + THTensor_(free)(input_t); + THTensor_(free)(output_t); + THTensor_(free)(finput_t); + } + } + + THTensor_(free)(input); + if (freeWeight) + THTensor_(free)(weight); +} + +static void THNN_(VolumetricConvolutionMM_updateGradInput_frame)( + THTensor *gradInput, + THTensor *gradOutput, + THTensor *weight, + THTensor *fgradInput, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) +{ + THTensor *gradOutput2d = THTensor_(newWithStorage2d)( + gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1 + ); + + THTensor_(addmm)(fgradInput, 0, fgradInput, 1, weight, gradOutput2d); + THTensor_(free)(gradOutput2d); + + THTensor_(zero)(gradInput); + + THNN_(unfolded_acc_vol)( + fgradInput, gradInput, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH, + gradInput->size[0], gradInput->size[1], gradInput->size[3], gradInput->size[2], + gradOutput->size[1], gradOutput->size[3], gradOutput->size[2] + ); +} + +void THNN_(VolumetricConvolutionMM_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) +{ + int nOutputPlane = (int)weight->size[0]; + + THNN_(VolumetricConvolutionMM_shapeCheck)( + state, input, gradOutput, weight, NULL, + kT, kW, kH, dT, dW, dH, pT, pW, pH); + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + + int freeWeight = THNN_(view_weight)(&weight); + + THTensor_(resizeAs)(gradInput, input); + THTensor_(resizeAs)(fgradInput, finput); + // depending on the BLAS library, fgradInput (result tensor) might + // be left uninitialized on zero alpha, which might lead to weird behavior + // hence, to be safe, zero it + THTensor_(zero)(fgradInput); + THTensor *tweight = THTensor_(new)(); + THTensor_(transpose)(tweight, weight, 0, 1); + + if (input->nDimension == 4) + { + THNN_(VolumetricConvolutionMM_updateGradInput_frame)( + gradInput, gradOutput, tweight, fgradInput, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH + ); + } + else + { + long T = input->size[0]; + long t; + +//#pragma omp parallel for private(t) + for (t = 0; t < T; 
t++) + { + THTensor *gradInput_t = THTensor_(newSelect)(gradInput, 0, t); + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *fgradInput_t = THTensor_(newSelect)(fgradInput, 0, t); + + THNN_(VolumetricConvolutionMM_updateGradInput_frame)( + gradInput_t, gradOutput_t, tweight, fgradInput_t, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH + ); + + THTensor_(free)(gradInput_t); + THTensor_(free)(gradOutput_t); + THTensor_(free)(fgradInput_t); + } + } + + THTensor_(free)(tweight); + THTensor_(free)(input); + THTensor_(free)(gradOutput); + if (freeWeight) + THTensor_(free)(weight); +} + +static void THNN_(VolumetricConvolutionMM_accGradParameters_frame)( + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + real scale) +{ + long i; + THTensor *gradOutput2d = THTensor_(newWithStorage2d)( + gradOutput->storage, gradOutput->storageOffset, + gradOutput->size[0], -1, + gradOutput->size[1]*gradOutput->size[2]*gradOutput->size[3], -1 + ); + + THTensor *tfinput = THTensor_(new)(); + THTensor_(transpose)(tfinput, finput, 0, 1); + THTensor_(addmm)(gradWeight, 1, gradWeight, scale, gradOutput2d, tfinput); + THTensor_(free)(tfinput); + + if (gradBias) { + for (i = 0; i < gradBias->size[0]; i++) + { + long k; + real sum = 0; + real *data = gradOutput2d->storage->data + gradOutput2d->storageOffset + i*gradOutput2d->stride[0]; + for (k = 0; k < gradOutput2d->size[1]; k++) + sum += data[k]; + + (gradBias->storage->data + gradBias->storageOffset)[i] += scale * sum; + } + } + + THTensor_(free)(gradOutput2d); +} + +void THNN_(VolumetricConvolutionMM_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + int freeWeight; + int nOutputPlane = (int)gradWeight->size[0]; + + THNN_(VolumetricConvolutionMM_shapeCheck)( + state, input, gradOutput, gradWeight, gradBias, + kT, kW, kH, dT, dW, dH, pT, pW, pH); + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + + freeWeight = THNN_(view_weight)(&gradWeight); + + if (input->nDimension == 4) // non-batch mode + { + THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput, gradWeight, gradBias, finput, scale); + } + else // batch mode + { + long T = input->size[0]; + long t; + + for (t = 0; t < T; t++) + { + THTensor *gradOutput_t = THTensor_(newSelect)(gradOutput, 0, t); + THTensor *finput_t = THTensor_(newSelect)(finput, 0, t); + + THNN_(VolumetricConvolutionMM_accGradParameters_frame)(gradOutput_t, gradWeight, gradBias, finput_t, scale); + + THTensor_(free)(gradOutput_t); + THTensor_(free)(finput_t); + } + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + if (freeWeight) + THTensor_(free)(gradWeight); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedConvolution.c new file mode 100644 index 000000000..ca740f78e --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedConvolution.c @@ -0,0 +1,420 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricDilatedConvolution.c" +#else + +static inline void THNN_(VolumetricDilatedConvolution_shapeCheck)( + THTensor *input, THTensor *gradOutput, + THTensor *weight, THTensor *bias, + int kT, int kH, int kW, int dT, int dH, int dW, + 
int padT, int padH, int padW, + int dilationT, int dilationH, int dilationW) { + THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input, + "4D or 5D (batch mode) tensor expected for input, but got: %s"); + THNN_ARGCHECK(weight->nDimension == 5, 4, weight, + "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " + "expected for weight, but got: %s"); + THArgCheck(kT > 0 && kW > 0 && kH > 0, 8, + "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", kT, kH, kW); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW); + THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 15, + "dilation should be greater than zero, but got dilationT: %d, dilationH: %d, dilationW: %d", + dilationT, dilationH, dilationW); + if (bias != NULL) { + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[0]); + } + + // Params + int ndim = input->nDimension; + int nInputPlane = weight->size[1]; + int nOutputPlane = weight->size[0]; + int dimf = 0; + int dimd = 1; + int dimh = 2; + int dimw = 3; + + if (ndim == 5) { + dimf++; + dimd++; + dimh++; + dimw++; + } + + long inputDepth = input->size[dimd]; + long inputHeight = input->size[dimh]; + long inputWidth = input->size[dimw]; + long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1) + THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). Output size is too small", + nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth); + + THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(VolumetricDilatedConvolution_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THTensor *weight, + THTensor *bias, + THTensor *columns, + THTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH) +{ + THNN_(VolumetricDilatedConvolution_shapeCheck)( + input, NULL, weight, bias, + kT, kH, kW, dT, dH, dW, padT, padH, padW, + dilationT, dilationH, dilationW); + + // Params: + int nInputPlane = weight->size[1]; + int nOutputPlane = weight->size[0]; + + input = THTensor_(newContiguous)(input); + weight = THTensor_(newContiguous)(weight); + bias = bias ? 
THTensor_(newContiguous)(bias) : bias; + int batch = 1; + if (input->nDimension == 4) { + // Force batch + batch = 0; + THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + } + + long inputDepth = input->size[2]; + long inputHeight = input->size[3]; + long inputWidth = input->size[4]; + long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Resize output + THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(zero)(output); + + // Resize temporary columns + THTensor_(resize2d)(columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->nDimension != 3 || + ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + // Resize plane and fill with ones... + THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *output_n = THTensor_(new)(); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + THTensor_(select)(output_n, output, 0, elt); + + // Do Bias first: + // M,N,K are dims of matrix A and B + long m_ = nOutputPlane; + long n_ = outputDepth * outputHeight * outputWidth; + long k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + THBlas_(gemm)( + 't', 'n', + n_, m_, k_, + 1, + THTensor_(data)(ones), k_, + THTensor_(data)(bias), k_, + 0, + THTensor_(data)(output_n), n_ + ); + } else { + THTensor_(zero)(output_n); + } + + // Extract columns: + THNN_(vol2col)( + THTensor_(data)(input_n), + nInputPlane, inputDepth, inputHeight, inputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THTensor_(data)(columns) + ); + + // M,N,K are dims of matrix A and B + long m = nOutputPlane; + long n = columns->size[1]; + long k = nInputPlane*kT*kH*kW; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 'n', + n, m, k, + 1, + THTensor_(data)(columns), n, + THTensor_(data)(weight), k, + 1, + THTensor_(data)(output_n), n + ); + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(output_n); + + // Resize output + if (batch == 0) { + THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(weight); + if (bias) THTensor_(free)(bias); +} + +void THNN_(VolumetricDilatedConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *gradColumns, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH) +{ + THNN_(VolumetricDilatedConvolution_shapeCheck)( + input, gradOutput, weight, NULL, + kT, kH, kW, dT, dH, dW, padT, padH, padW, + dilationT, dilationH, 
dilationW); + + // Params + int nInputPlane = weight->size[1]; + int nOutputPlane = weight->size[0]; + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + weight = THTensor_(newContiguous)(weight); + + int batch = 1; + if (input->nDimension == 4) { + // Force batch + batch = 0; + THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + long inputDepth = input->size[2]; + long inputWidth = input->size[4]; + long inputHeight = input->size[3]; + long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Resize output + THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); + + // Resize temporary columns + THTensor_(resize2d)(gradColumns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); + THTensor_(zero)(gradColumns); + + // Helpers + THTensor *gradInput_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per sample: + THTensor_(select)(gradInput_n, gradInput, 0, elt); + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // M,N,K are dims of matrix A and B + long m = nInputPlane*kT*kW*kH; + long n = gradColumns->size[1]; + long k = nOutputPlane; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 't', + n, m, k, + 1, + THTensor_(data)(gradOutput_n), n, + THTensor_(data)(weight), m, + 0, + THTensor_(data)(gradColumns), n + ); + + // Unpack columns back into input: + THNN_(col2vol)( + THTensor_(data)(gradColumns), + nInputPlane, inputDepth, inputHeight, inputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THTensor_(data)(gradInput_n) + ); + } + + // Free + THTensor_(free)(gradInput_n); + THTensor_(free)(gradOutput_n); + + // Resize output + if (batch == 0) { + THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); + THTensor_(resize4d)(gradInput, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); +} + +void THNN_(VolumetricDilatedConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *columns, + THTensor *ones, + int kT, int kW, int kH, + int dT, int dW, int dH, + int padT, int padW, int padH, + int dilationT, int dilationW, int dilationH, + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + THNN_(VolumetricDilatedConvolution_shapeCheck)( + input, gradOutput, gradWeight, gradBias, + kT, kH, kW, dT, dH, dW, padT, padH, padW, + dilationT, dilationH, dilationW); + + // Params + int nInputPlane = gradWeight->size[1]; + int nOutputPlane = gradWeight->size[0]; + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + + int batch = 1; + if (input->nDimension == 4) { + // Force batch + batch = 0; + 
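/* a single sample is promoted to a batch of one so the per-sample
+       vol2col + GEMM loop below covers both layouts; the resize4d calls
+       at the end restore the original shapes */
+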
THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + long inputDepth = input->size[2]; + long inputWidth = input->size[4]; + long inputHeight = input->size[3]; + long outputDepth = (inputDepth + 2*padT - (dilationT * (kT - 1) + 1)) / dT + 1; + long outputWidth = (inputWidth + 2*padW - (dilationW * (kW - 1) + 1)) / dW + 1; + long outputHeight = (inputHeight + 2*padH - (dilationH * (kH - 1) + 1)) / dH + 1; + + // Batch size + input planes + long batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) { + // Resize plane and fill with ones... + THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Resize temporary columns + THTensor_(resize2d)(columns, nInputPlane*kT*kW*kH, outputDepth*outputHeight*outputWidth); + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + // For each elt in batch, do: + for (int elt = 0; elt < batchSize; elt ++) { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + THNN_(vol2col)( + THTensor_(data)(input_n), + nInputPlane, inputDepth, inputHeight, inputWidth, + kT, kH, kW, padT, padH, padW, dT, dH, dW, + dilationT, dilationH, dilationW, + THTensor_(data)(columns) + ); + + // M,N,K are dims of matrix A and B + long m = nOutputPlane; + long n = nInputPlane*kT*kW*kH; + long k = columns->size[1]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 't', 'n', + n, m, k, + scale, + THTensor_(data)(columns), k, + THTensor_(data)(gradOutput_n), k, + 1, + THTensor_(data)(gradWeight), n + ); + + // Do Bias: + // M,N,K are dims of matrix A and B + long m_ = nOutputPlane; + long k_ = outputDepth * outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + if (gradBias) { + THBlas_(gemv)( + 't', + k_, m_, + scale, + THTensor_(data)(gradOutput_n), k_, + THTensor_(data)(ones), 1, + 1, + THTensor_(data)(gradBias), 1 + ); + } + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(gradOutput_n); + + // Resize + if (batch == 0) { + THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedMaxPooling.c new file mode 100644 index 000000000..66c0f9531 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricDilatedMaxPooling.c @@ -0,0 +1,515 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricDilatedMaxPooling.c" +#else + +static inline void THNN_(VolumetricDilatedMaxPooling_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THIndexTensor *indices, + int kT, int kW, int kH, + int dT, int dW, int dH, + int pT, int pW, int pH, + int dilationT, int dilationW, int dilationH, + bool ceilMode) { + int ndim = input->nDimension; + int dimN = 0; + int dimt = 1; + int dimh = 2; + int dimw = 
3; + long nslices; + long itime; + long iheight; + long iwidth; + long otime; + long oheight; + long owidth; + + THArgCheck(kT > 0 && kW > 0 && kH > 0, 5, + "kernel size should be greater than zero, but got kT: %d kH: %d kW: %d", + kT, kH, kW); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 8, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", + dT, dH, dW); + THArgCheck(dilationT > 0 && dilationW > 0 && dilationH > 0, 14, + "dilation should be greater than 0, but got dilationT: %d dilationH: %d dilationW: %d", + dilationT, dilationH, dilationW); + + THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input, + "4D or 5D (batch mode) tensor expected for input, but got: %s"); + + if (input->nDimension == 5) + { + dimN++; + dimt++; + dimh++; + dimw++; + } + + THArgCheck(kT/2 >= pT && kW/2 >= pW && kH/2 >= pH, 2, + "pad should be smaller than half of kernel size, but got " + "kT: %d kW: %d, kH: %d, padT: %d, padW: %d, padH: %d", + kT, kW, kH, pT, pW, pH); + + nslices = input->size[dimN]; + itime = input->size[dimt]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + if (ceilMode) + { + otime = (int)(ceil((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1; + oheight = (int)(ceil((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1; + owidth = (int)(ceil((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1; + } + else + { + otime = (int)(floor((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1; + oheight = (int)(floor((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1; + owidth = (int)(floor((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1; + } + + if (pT || pW || pH) + { + // ensure that the last pooling starts inside the image + if ((otime - 1)*dT >= itime + pT) + --otime; + if ((oheight - 1)*dH >= iheight + pH) + --oheight; + if ((owidth - 1)*dW >= iwidth + pW) + --owidth; + } + + if (otime < 1 || owidth < 1 || oheight < 1) + THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). 
Output size is too small", + nslices,itime,iheight,iwidth,nslices,otime,oheight,owidth); + + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimN, nslices); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimt, otime); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, oheight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, owidth); + } + if (indices != NULL) { + THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimN, nslices); + THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimt, otime); + THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimh, oheight); + THNN_CHECK_DIM_SIZE_INDICES(indices, ndim, dimw, owidth); + } +} + +static void THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)( + real *input_p, + real *output_p, + THIndex_t *indz_p, + long nslices, + long itime, + long iwidth, + long iheight, + long otime, + long owidth, + long oheight, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + int dilationT, + int dilationW, + int dilationH) +{ + long k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + /* loop over output */ + long i, j, ti; + for (ti = 0; ti < otime; ti++) + { + for (i = 0; i < oheight; i++) + { + for (j = 0; j < owidth; j++) + { + /* local pointers */ + + long start_t = ti * dT - pT; + long start_h = i * dH - pH; + long start_w = j * dW - pW; + + long kernel_t = fminf(kT, kT + start_t); + long kernel_h = fminf(kH, kH + start_h); + long kernel_w = fminf(kW, kW + start_w); + + while(start_t < 0) + start_t += dilationT; + while(start_h < 0) + start_h += dilationH; + while(start_w < 0) + start_w += dilationW; + + real *ip = input_p + k * itime * iwidth * iheight + + start_t * iwidth * iheight + start_h * iwidth + start_w; + real *op = output_p + k * otime * owidth * oheight + + ti * owidth * oheight + i * owidth + j; + THIndex_t *indzp = indz_p + k * otime * owidth * oheight + + ti * owidth * oheight + i * owidth + j; + + /* compute local max: */ + real maxval = -THInf; + int x,y,z; + int mx, my, mz; + mx = my = mz = -1; + + for (z = 0; z < kernel_t; z++) + { + for (y = 0; y < kernel_h; y++) + { + for (x = 0; x < kernel_w; x++) + { + if ((start_t + z * dilationT < itime) && (start_h + y * dilationH < iheight) && (start_w + x * dilationW < iwidth)) + { + real val = *(ip + z * dilationT * iwidth * iheight + y * dilationH * iwidth + x * dilationW); + if (val > maxval) + { + maxval = val; + // Store indices w.r.t the kernel dimension + mz = z + (kT - kernel_t); + my = y + (kH - kernel_h); + mx = x + (kW - kernel_w); + } + } + } + } + } + + // set max values + ((unsigned char*)(indzp))[0] = mz; + ((unsigned char*)(indzp))[1] = my; + ((unsigned char*)(indzp))[2] = mx; + ((unsigned char*)(indzp))[3] = 0; + + /* set output to local max */ + *op = maxval; + } + } + } + } +} + +void THNN_(VolumetricDilatedMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + int dilationT, + int dilationW, + int dilationH, + bool ceilMode) +{ + long nslices; + long itime; + long iheight; + long iwidth; + long otime; + long oheight; + long owidth; + real *input_data; + real *output_data; + THIndex_t *indices_data; + + + int dimN = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + if (input->nDimension == 5) + { + dimN++; + dimt++; + dimh++; + dimw++; + } + + THNN_(VolumetricDilatedMaxPooling_shapeCheck)( + state, input, NULL, NULL, + kT, kW, kH, dT, dW, dH, + pT, pW, pH, dilationT, dilationW, 
dilationH, + ceilMode); + + /* sizes */ + nslices = input->size[dimN]; + itime = input->size[dimt]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + if (ceilMode) + { + otime = (int)(ceil((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1; + oheight = (int)(ceil((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1; + owidth = (int)(ceil((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1; + } + else + { + otime = (int)(floor((float)(itime - (dilationT * (kT - 1) + 1) + 2*pT) / dT)) + 1; + oheight = (int)(floor((float)(iheight - (dilationH * (kH - 1) + 1) + 2*pH) / dH)) + 1; + owidth = (int)(floor((float)(iwidth - (dilationW * (kW - 1) + 1) + 2*pW) / dW)) + 1; + } + + if (pT || pW || pH) + { + // ensure that the last pooling starts inside the image + if ((otime - 1)*dT >= itime + pT) + --otime; + if ((oheight - 1)*dH >= iheight + pH) + --oheight; + if ((owidth - 1)*dW >= iwidth + pW) + --owidth; + } + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + if (input->nDimension == 4) /* non-batch mode */ + { + /* resize output */ + THTensor_(resize4d)(output, nslices, otime, oheight, owidth); + /* indices will contain ti,i,j uchar locations packed into float/double */ + THIndexTensor_(resize4d)(indices, nslices, otime, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)( + input_data, output_data, + indices_data, + nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH, + dilationT, dilationW, dilationH + ); + } + else /* batch mode */ + { + long p; + long nBatch = input->size[0]; + + long istride = nslices * itime * iwidth * iheight; + long ostride = nslices * otime * owidth * oheight; + + /* resize output */ + THTensor_(resize5d)(output, nBatch, nslices, otime, oheight, owidth); + /* indices will contain ti,i,j locations for each output point */ + THIndexTensor_(resize5d)(indices, nBatch, nslices, otime, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + +#pragma omp parallel for private(p) + for (p=0; p < nBatch; p++) + { + THNN_(VolumetricDilatedMaxPooling_updateOutput_frame)( + input_data + p * istride, + output_data + p * ostride, + indices_data + p * ostride, + nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + kT, kW, kH, + dT, dW, dH, + pT, pW, pH, + dilationT, dilationW, dilationH + ); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)( + real *gradInput_p, + real *gradOutput_p, + THIndex_t *indz_p, + long nslices, + long itime, + long iwidth, + long iheight, + long otime, + long owidth, + long oheight, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + int dilationT, + int dilationW, + int dilationH) +{ + long k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + real *gradInput_p_k = gradInput_p + k * itime * iwidth * iheight; + real *gradOutput_p_k = gradOutput_p + k * otime * owidth * oheight; + THIndex_t *indz_p_k = indz_p + k * otime * owidth * oheight; + + /* calculate max points */ + long ti, i, j; + for (ti = 0; ti < otime; ti++) + { + for (i = 0; i < oheight; i++) + { + for (j = 0; j < owidth; j++) + { + /* retrieve position of max */ + THIndex_t * indzp = &indz_p_k[ti * oheight * owidth + 
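/* The forward pass packed the argmax offsets (z, y, x within the kernel
   window) into the low bytes of this index entry; the lines below unpack
   them and map each offset back to an absolute input coordinate, e.g.
     maxti = mz * dilationT + ti * dT - pT. */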
i * owidth + j]; + long maxti = ((unsigned char*)(indzp))[0] * dilationT + ti * dT - pT; + long maxi = ((unsigned char*)(indzp))[1] * dilationH + i * dH - pH; + long maxj = ((unsigned char*)(indzp))[2] * dilationW + j * dW - pW; + + if (maxti != -1) { + /* update gradient */ + gradInput_p_k[maxti * iheight * iwidth + maxi * iwidth + maxj] += + gradOutput_p_k[ti * oheight * owidth + i * owidth + j]; + } + } + } + } + } +} + +void THNN_(VolumetricDilatedMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + int dilationT, + int dilationW, + int dilationH, + bool ceilMode) +{ + int nslices; + int itime; + int iheight; + int iwidth; + int otime; + int oheight; + int owidth; + real *gradInput_data; + real *gradOutput_data; + THIndex_t *indices_data; + + int dimN = 0; + int dimt = 1; + int dimh = 2; + int dimw = 3; + + THNN_(VolumetricDilatedMaxPooling_shapeCheck)( + state, input, gradOutput, indices, + kT, kW, kH, dT, dW, dH, + pT, pW, pH, dilationT, dilationW, dilationH, + ceilMode); + + // TODO: gradOutput shape check + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->nDimension == 5) + { + dimN++; + dimt++; + dimh++; + dimw++; + } + + /* sizes */ + nslices = input->size[dimN]; + itime = input->size[dimt]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + otime = gradOutput->size[dimt]; + oheight = gradOutput->size[dimh]; + owidth = gradOutput->size[dimw]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = THIndexTensor_(data)(indices); + + /* backprop */ + if (input->nDimension == 4) /* non-batch mode*/ + { + THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)( + gradInput_data, gradOutput_data, + indices_data, + nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + dT, dW, dH, + pT, pW, pH, + dilationT, dilationW, dilationH + ); + } + else /* batch mode */ + { + long p; + long nBatch = input->size[0]; + + long istride = nslices * itime * iwidth * iheight; + long ostride = nslices * otime * owidth * oheight; + +#pragma omp parallel for private(p) + for (p = 0; p < nBatch; p++) + { + THNN_(VolumetricDilatedMaxPooling_updateGradInput_frame)( + gradInput_data + p * istride, + gradOutput_data + p * ostride, + indices_data + p * ostride, + nslices, + itime, iwidth, iheight, + otime, owidth, oheight, + dT, dW, dH, + pT, pW, pH, + dilationT, dilationW, dilationH + ); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricFractionalMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricFractionalMaxPooling.c new file mode 100644 index 000000000..236986bb9 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricFractionalMaxPooling.c @@ -0,0 +1,279 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricFractionalMaxPooling.c" +#else + +static long* THNN_(VolumetricFractionalMaxPooling_generateIntervals)( + real sample, + long inputSize, + long outputSize, + int poolSize) { + real alpha = (real) (inputSize - poolSize) / (real) (outputSize - 1); + long* sequence = (long*) THAlloc(sizeof(long) * outputSize); + + long i; + for (i = 0; i < outputSize - 1; ++i) { + 
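/* Each start index below is floor((i + sample) * alpha) - floor(sample * alpha),
   giving nondecreasing pseudorandom starts whose poolSize-wide windows stay
   inside the input. Worked example (assumed values): inputSize=9, poolSize=2,
   outputSize=4 gives alpha = 7/3; sample=0.5 yields starts {0, 2, 4} plus the
   forced final start inputSize - poolSize = 7. */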
sequence[i] = + (long) ((i + sample) * alpha) - (long) (sample * alpha); + } + sequence[outputSize - 1] = inputSize - poolSize; + + return sequence; +} + +static void THNN_(VolumetricFractionalMaxPooling_updateOutput_frame)( + real* input, + real* output, + THIndex_t* indices, + real* randomSamples, + long numPlanes, + long inputT, long inputW, long inputH, + long outputT, long outputW, long outputH, + int poolSizeT, int poolSizeW, int poolSizeH) { + long plane; +#pragma omp parallel for private(plane) + for (plane = 0; plane < numPlanes; ++plane) { + /* each plane contains 3 random samples, one for T, one for W, and one for H */ + real* randomSamplesForPlane = randomSamples + plane * 3; + + /* Generate interval sequence */ + long* sequenceT = + THNN_(VolumetricFractionalMaxPooling_generateIntervals)( + randomSamplesForPlane[0], inputT, outputT, poolSizeT); + long* sequenceW = + THNN_(VolumetricFractionalMaxPooling_generateIntervals)( + randomSamplesForPlane[1], inputW, outputW, poolSizeW); + long* sequenceH = + THNN_(VolumetricFractionalMaxPooling_generateIntervals)( + randomSamplesForPlane[2], inputH, outputH, poolSizeH); + + /* loop over output */ + long h, w, t; + + real* inputForPlane = input + plane * inputT * inputW * inputH; + real* outputForPlane = output + plane * outputT * outputW * outputH; + THIndex_t* indicesForPlane = indices + plane * outputT * outputW * outputH; + + for (h = 0; h < outputH; ++h) { + long inputHStart = sequenceH[h]; + + for (w = 0; w < outputW; ++w) { + long inputWStart = sequenceW[w]; + + for (t = 0; t < outputT; ++t) { + long inputTStart = sequenceT[t]; + + real maxVal = -THInf; + long maxIndex = -1; + + long h2, w2, t2; + for (h2 = inputHStart; h2 < inputHStart + poolSizeH; ++h2) { + for (w2 = inputWStart; w2 < inputWStart + poolSizeW; ++w2) { + for (t2 = inputTStart; t2 < inputTStart + poolSizeT; ++t2) { + THAssert(h2 >= 0 && h2 < inputH); + THAssert(w2 >= 0 && w2 < inputW); + THAssert(t2 >= 0 && t2 < inputT); + + long planeIndex = h2 * inputW * inputT + w2 * inputT + t2; + real val = inputForPlane[planeIndex]; + if (val > maxVal) { + maxVal = val; + maxIndex = planeIndex; + } + } + } + } + + THAssert(maxVal != -THInf); + THAssert(maxIndex != -1); + + outputForPlane[h * outputW * outputT + w * outputT + t] = maxVal; + /* +1 to lua index */ + indicesForPlane[h * outputW * outputT + w * outputT + t] = maxIndex + TH_INDEX_BASE; + } + } + } + + THFree(sequenceT); + THFree(sequenceW); + THFree(sequenceH); + } +} + +void THNN_(VolumetricFractionalMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int outputT, int outputW, int outputH, + int poolSizeT, int poolSizeW, int poolSizeH, + THIndexTensor *indices, + THTensor *randomSamples) { + + long numBatch = 1; + int planeDim = 0; + int heightDim = 1; + int widthDim = 2; + int timeDim = 3; + + long numInputDims = THTensor_(nDimension)(input); + THNN_ARGCHECK(numInputDims == 4 || numInputDims == 5, 2, input, + "4D or 5D (batch mode) tensor expected for input, but got: %s"); + + if (numInputDims == 5) { + numBatch = THTensor_(size)(input, 0); + planeDim++; + heightDim++; + widthDim++; + timeDim++; + } + + /* sizes */ + long numPlanes = THTensor_(size)(input, planeDim); + long inputH = THTensor_(size)(input, heightDim); + long inputW = THTensor_(size)(input, widthDim); + long inputT = THTensor_(size)(input, timeDim); + + THArgCheck(outputH + poolSizeH - 1 < inputH, 9, + "poolSizeH (%d) too large relative to input height (%d)", + poolSizeH, inputH); + THArgCheck(outputW + poolSizeW 
- 1 < inputW, 8, + "poolSizeW (%d) too large relative to input width (%d)", + poolSizeW, inputW); + THArgCheck(outputT + poolSizeT - 1 < inputT, 7, + "poolSizeT (%d) too large relative to input time (%d)", + poolSizeT, inputT); + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + if (numInputDims == 4) { + /* resize output */ + THTensor_(resize4d)(output, numPlanes, outputH, outputW, outputT); + /* indices will contain the locations for each output point */ + THIndexTensor_(resize4d)(indices, numPlanes, outputH, outputW, outputT); + + THNN_(VolumetricFractionalMaxPooling_updateOutput_frame)( + THTensor_(data)(input), + THTensor_(data)(output), + THIndexTensor_(data)(indices), + THTensor_(data)(randomSamples), + numPlanes, inputT, inputW, inputH, + outputT, outputW, outputH, poolSizeT, poolSizeW, poolSizeH); + } else { + THTensor_(resize5d)(output, numBatch, numPlanes, outputH, outputW, outputT); + /* indices will contain the locations for each output point */ + THIndexTensor_(resize5d)(indices, numBatch, numPlanes, outputH, outputW, outputT); + + long batch; +#pragma omp parallel for private(batch) + for (batch = 0; batch < numBatch; ++batch) { + THNN_(VolumetricFractionalMaxPooling_updateOutput_frame)( + THTensor_(data)(input) + batch * numPlanes * inputH * inputW * inputT, + THTensor_(data)(output) + batch * numPlanes * outputH * outputW * outputT, + THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW * outputT, + THTensor_(data)(randomSamples) + batch * numPlanes * 3, + numPlanes, inputT, inputW, inputH, + outputT, outputW, outputH, poolSizeT, poolSizeW, poolSizeH); + } + } + + /* cleanup */ + THTensor_(free)(input); +} + +static void THNN_(VolumetricFractionalMaxPooling_updateGradInput_frame)( + real* gradInput, + real* gradOutput, + THIndex_t* indices, + long numPlanes, + long inputT, long inputW, long inputH, + long outputT, long outputW, long outputH) { + long plane; +#pragma omp parallel for private(plane) + for (plane = 0; plane < numPlanes; plane++) { + real* gradInputForPlane = gradInput + plane * inputT * inputW * inputH; + real* gradOutputForPlane = gradOutput + plane * outputT * outputW * outputH; + THIndex_t* indicesForPlane = indices + plane * outputT * outputW * outputH; + + long h, w, t; + for (h = 0; h < outputH; ++h) { + for (w = 0; w < outputW; ++w) { + for (t = 0; t < outputT; ++t) { + long outputIndex = h * outputW * outputT + w * outputT + t; + long index = indicesForPlane[outputIndex] - TH_INDEX_BASE; + THAssert(index >= 0 && index < inputT * inputW * inputH); + + gradInputForPlane[index] += gradOutputForPlane[outputIndex]; + } + } + } + } +} + +void THNN_(VolumetricFractionalMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int outputT, int outputW, int outputH, + int poolSizeT, int poolSizeW, int poolSizeH, + THIndexTensor *indices) { + + long numBatch = 1; + int planeDim = 0; + int heightDim = 1; + int widthDim = 2; + int timeDim = 3; + + long numInputDims = THTensor_(nDimension)(input); + if (numInputDims == 5) { + numBatch = THTensor_(size)(input, 0); + planeDim = 1; + heightDim++; + widthDim++; + timeDim++; + } + + /* sizes */ + long numPlanes = THTensor_(size)(input, planeDim); + long inputH = THTensor_(size)(input, heightDim); + long inputW = THTensor_(size)(input, widthDim); + long inputT = THTensor_(size)(input, timeDim); + + THArgCheck(outputT == THTensor_(size)(gradOutput, timeDim), 3, + "gradOutput time unexpected"); + THArgCheck(outputW == 
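/* Unlike the dilated-pooling code, the indices saved by this module are
   absolute flattened offsets within a plane (plus TH_INDEX_BASE for
   Lua-style 1-indexing), so the _frame helper above can scatter gradients
   with a single subtraction and lookup. */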
THTensor_(size)(gradOutput, widthDim), 3, + "gradOutput width unexpected"); + THArgCheck(outputH == THTensor_(size)(gradOutput, heightDim), 3, + "gradOutput height unexpected"); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* backprop */ + if (numInputDims == 4) { + THNN_(VolumetricFractionalMaxPooling_updateGradInput_frame)( + THTensor_(data)(gradInput), + THTensor_(data)(gradOutput), + THIndexTensor_(data)(indices), + numPlanes, inputT, inputW, inputH, outputT, outputW, outputH); + } else { + long batch; +#pragma omp parallel for private(batch) + for (batch = 0; batch < numBatch; ++batch) { + THNN_(VolumetricFractionalMaxPooling_updateGradInput_frame)( + THTensor_(data)(gradInput) + batch * numPlanes * inputH * inputW * inputT, + THTensor_(data)(gradOutput) + batch * numPlanes * outputH * outputW * outputT, + THIndexTensor_(data)(indices) + batch * numPlanes * outputH * outputW * outputT, + numPlanes, inputT, inputW, inputH, outputT, outputW, outputH); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricFullConvolution.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricFullConvolution.c new file mode 100644 index 000000000..c974fab50 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricFullConvolution.c @@ -0,0 +1,541 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricFullConvolution.c" +#else + +static void THNN_(vol2col)( + const real *data_vol, const int channels, + const int depth, const int height, const int width, + const int kT, const int kH, const int kW, + const int pT, const int pH, const int pW, + const int dT, const int dH, const int dW, + const int dilationT, const int dilationH, const int dilationW, + real *data_col) +{ + int c, t, h, w; + int depth_col = (depth + 2 * pT - (dilationT * (kT - 1) + 1)) / dT + 1; + int height_col = (height + 2 * pH - (dilationH * (kH - 1) + 1)) / dH + 1; + int width_col = (width + 2 * pW - (dilationW * (kW - 1) + 1)) / dW + 1; + int channels_col = channels * kT * kH * kW; + for (c = 0; c < channels_col; ++c) + { + int w_offset = c % kW; + int h_offset = (c / kW) % kH; + int t_offset = (c / kW / kH) % kT; + int c_vol = c / kT / kH / kW; + for (t = 0; t < depth_col; ++t) + { + for (h = 0; h < height_col; ++h) + { + for (w = 0; w < width_col; ++w) + { + int t_pad = t * dT - pT + t_offset * dilationT; + int h_pad = h * dH - pH + h_offset * dilationH; + int w_pad = w * dW - pW + w_offset * dilationW; + if (t_pad >= 0 && t_pad < depth && + h_pad >= 0 && h_pad < height && + w_pad >= 0 && w_pad < width) + data_col[((c * depth_col + t) * height_col + h) * width_col + w] = + data_vol[((c_vol * depth + t_pad) * height + h_pad) * width + w_pad]; + else + data_col[((c * depth_col + t) * height_col + h) * width_col + w] = 0; + } + } + } + } +} + +static void THNN_(col2vol)( + const real* data_col, const int channels, + const int depth, const int height, const int width, + const int kT, const int kH, const int kW, + const int pT, const int pH, const int pW, + const int dT, const int dH, const int dW, + const int dilationT, const int dilationH, const int dilationW, + real* data_vol) +{ + int c, t, h, w; + memset(data_vol, 0, sizeof(real) * depth * height * width * channels); + int depth_col = (depth + 2 * pT - (dilationT * (kT - 1) + 1)) / dT + 1; + int height_col = (height + 2 * pH - (dilationH * (kH - 1) + 1)) 
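/* col2vol is, in effect, the adjoint of vol2col above: it walks the same
   (channel, kernel-offset, output-position) enumeration but accumulates
   into data_vol with +=, which is why the buffer is zeroed first;
   overlapping kernel windows therefore sum their contributions. */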
/ dH + 1; + int width_col = (width + 2 * pW - (dilationW * (kW - 1) + 1)) / dW + 1; + int channels_col = channels * kT * kH * kW; + for (c = 0; c < channels_col; ++c) + { + int w_offset = c % kW; + int h_offset = (c / kW) % kH; + int t_offset = (c / kW / kH) % kT; + int c_vol = c / kT / kH / kW; + for (t = 0; t < depth_col; ++t) + { + for (h = 0; h < height_col; ++h) + { + for (w = 0; w < width_col; ++w) + { + int t_pad = t * dT - pT + t_offset * dilationT; + int h_pad = h * dH - pH + h_offset * dilationH; + int w_pad = w * dW - pW + w_offset * dilationW; + if (t_pad >= 0 && t_pad < depth && + h_pad >= 0 && h_pad < height && + w_pad >= 0 && w_pad < width) + data_vol[((c_vol * depth + t_pad) * height + h_pad) * width + w_pad] += + data_col[((c * depth_col + t) * height_col + h) * width_col + w]; + } + } + } + } +} + +static inline void THNN_(VolumetricFullConvolution_shapeCheck)( + THTensor *input, THTensor *gradOutput, + THTensor *weight, THTensor *bias, + int dT, int dW, int dH, int pT, int pW, int pH, + int aT, int aW, int aH) { + THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input, + "4D or 5D (batch mode) tensor expected for input, but got: %s"); + // number of input & output planes and kernel size is indirectly defined by the weight tensor + THNN_ARGCHECK(weight->nDimension == 5, 4, weight, + "5D (nOutputPlane x nInputPlane x kT x kH x kW) tensor " + "expected for weight, but got: %s"); + THArgCheck(dT > 0 && dW > 0 && dH > 0, 11, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", dT, dH, dW); + THArgCheck(aT < dT && aW < dW && aH < dH, 15, + "output adjustment must be smaller than stride, but got " + "adjT: %d adjH: %d adjW: %d dT: %d dH: %d dW: %d", + aT, aH, aW, dT, dH, dW); + + int ndim = input->nDimension; + const int nInputPlane = (int)weight->size[0]; + const int nOutputPlane = (int)weight->size[1]; + const int kT = (int)weight->size[2]; + const int kH = (int)weight->size[3]; + const int kW = (int)weight->size[4]; + + if (bias != NULL) { + THNN_CHECK_DIM_SIZE(bias, 1, 0, weight->size[1]); + } + + int dimf = 0; + int dimd = 1; + int dimh = 2; + int dimw = 3; + + if (ndim == 5) { + dimf++; + dimd++; + dimh++; + dimw++; + } + + const long inputWidth = input->size[dimw]; + const long inputHeight = input->size[dimh]; + const long inputDepth = input->size[dimd]; + const long outputWidth = (inputWidth - 1) * dW - 2*pW + kW + aW; + const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH; + const long outputDepth = (inputDepth - 1) * dT - 2*pT + kT + aT; + + if (outputDepth < 1 || outputWidth < 1 || outputHeight < 1) + THError("Given input size: (%dx%dx%dx%d). Calculated output size: (%dx%dx%dx%d). 
Output size is too small", + nInputPlane,inputDepth,inputHeight,inputWidth,nOutputPlane,outputDepth,outputHeight,outputWidth); + + THNN_CHECK_DIM_SIZE(input, ndim, dimf, nInputPlane); + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimf, nOutputPlane); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimd, outputDepth); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimh, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, ndim, dimw, outputWidth); + } +} + +void THNN_(VolumetricFullConvolution_updateOutput)( + THNNState *state, + THTensor *input, // 4D or 5D (batch) tensor + THTensor *output, + THTensor *weight, // weight tensor (nInputPlane x nOutputPlane x kT x kH x kW) + THTensor *bias, + THTensor *finput, // internal columns buffer + THTensor *fgradInput, // internal ones buffer + int dT, int dW, int dH, // stride of the convolution + int pT, int pW, int pH, // padding + int aT, int aW, int aH) // extra output adjustment +{ + THTensor *columns = finput; + THTensor *ones = fgradInput; + + THNN_(VolumetricFullConvolution_shapeCheck)( + input, NULL, weight, bias, + dT, dW, dH, pT, pW, pH, aT, aW, aH); + + const int nInputPlane = (int)weight->size[0]; + const int nOutputPlane = (int)weight->size[1]; + const int kT = (int)weight->size[2]; + const int kH = (int)weight->size[3]; + const int kW = (int)weight->size[4]; + + input = THTensor_(newContiguous)(input); + weight = THTensor_(newContiguous)(weight); + bias = bias ? THTensor_(newContiguous)(bias) : bias; + int batch = 1; + if (input->nDimension == 4) + { + // Force batch + batch = 0; + THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + } + + const long inputWidth = input->size[4]; + const long inputHeight = input->size[3]; + const long inputDepth = input->size[2]; + const long outputWidth = (inputWidth - 1) * dW - 2*pW + kW + aW; + const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH; + const long outputDepth = (inputDepth - 1) * dT - 2*pT + kT + aT; + + // Batch size + input planes + const long batchSize = input->size[0]; + + // Resize output + THTensor_(resize5d)(output, batchSize, nOutputPlane, outputDepth, outputHeight, outputWidth); + + // Resize temporary columns + THTensor_(resize2d)(columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); + THTensor_(zero)(columns); + + // Define a buffer of ones, for bias accumulation + // Note: this buffer can be shared with other modules, it only ever gets increased, + // and always contains ones. + if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) + { + // Resize plane and fill with ones... 
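// (The bias GEMM further down multiplies this ones buffer against the
// bias vector, a rank-1 update that broadcasts bias[c] over every output
// location of channel c, so filling it with 1s once is all that's needed.)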
+ THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *output_n = THTensor_(new)(); + + int elt; + // For each elt in batch, do: + for (elt = 0; elt < batchSize; ++elt) + { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + THTensor_(select)(output_n, output, 0, elt); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + const long m = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + const long n = columns->size[1]; + const long k = weight->size[0]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 't', + n, m, k, + 1, + THTensor_(data)(input_n), n, + THTensor_(data)(weight), m, + 0, + THTensor_(data)(columns), n + ); + + // Unpack columns back into input: + THNN_(col2vol)( + THTensor_(data)(columns), + nOutputPlane, outputDepth, outputHeight, outputWidth, + kT, kH, kW, + pT, pH, pW, + dT, dH, dW, + 1, 1, 1, + THTensor_(data)(output_n) + ); + + // Do Bias after: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + const long m_ = nOutputPlane; + const long n_ = outputDepth * outputHeight * outputWidth; + const long k_ = 1; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + if (bias) { + THBlas_(gemm)( + 't', 'n', + n_, m_, k_, + 1, + THTensor_(data)(ones), k_, + THTensor_(data)(bias), k_, + 1, + THTensor_(data)(output_n), n_ + ); + } + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(output_n); + + // Resize output + if (batch == 0) + { + THTensor_(resize4d)(output, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(weight); + if (bias) THTensor_(free)(bias); +} + +void THNN_(VolumetricFullConvolution_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THTensor *weight, + THTensor *finput, + THTensor *fgradInput, // only used by cuda impl + int dT, int dW, int dH, // stride + int pT, int pW, int pH, // padding + int aT, int aW, int aH) // extra output adjustment +{ + THTensor *gradColumns = finput; + + // number of input & output planes and kernel size is indirectly defined by the weight tensor + THNN_(VolumetricFullConvolution_shapeCheck)( + input, gradOutput, weight, NULL, + dT, dW, dH, pT, pW, pH, aT, aW, aH); + + const int nInputPlane = (int)weight->size[0]; + const int nOutputPlane = (int)weight->size[1]; + const int kT = (int)weight->size[2]; + const int kH = (int)weight->size[3]; + const int kW = (int)weight->size[4]; + + input = THTensor_(newContiguous)(input); + weight = THTensor_(newContiguous)(weight); + gradOutput = THTensor_(newContiguous)(gradOutput); + + int batch = 1; + if (input->nDimension == 4) + { + // Force batch + batch = 0; + THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + const long inputWidth = input->size[4]; + const long inputHeight = input->size[3]; + const long inputDepth = input->size[2]; + const long outputWidth = (inputWidth - 1) * dW - 2*pW + kW + aW; + const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + 
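/* Transposed-convolution size rule, the inverse of the forward conv
   formula: out = (in - 1) * stride - 2 * pad + kernel + adjustment, where
   aT/aH/aW recover output sizes that the forward pass's flooring would
   otherwise leave ambiguous. */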
aH; + const long outputDepth = (inputDepth - 1) * dT - 2*pT + kT + aT; + + // Batch size + input planes + const long batchSize = input->size[0]; + + // Resize output + THTensor_(resize5d)(gradInput, batchSize, nInputPlane, inputDepth, inputHeight, inputWidth); + THTensor_(zero)(gradInput); + + // Resize temporary columns + THTensor_(resize2d)(gradColumns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); + + // Helpers + THTensor *gradInput_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + int elt; + // For each elt in batch, do: + for (elt = 0; elt < batchSize; ++elt) + { + // Matrix mulitply per sample: + THTensor_(select)(gradInput_n, gradInput, 0, elt); + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + THNN_(vol2col)( + THTensor_(data)(gradOutput_n), + nOutputPlane, outputDepth, outputHeight, outputWidth, + kT, kH, kW, + pT, pH, pW, + dT, dH, dW, + 1, 1, 1, + THTensor_(data)(gradColumns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + const long m = weight->size[0]; + const long n = gradColumns->size[1]; + const long k = weight->size[1] * weight->size[2] * weight->size[3] * weight->size[4]; + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 'n', 'n', + n, m, k, + 1, + THTensor_(data)(gradColumns), n, + THTensor_(data)(weight), k, + 0, + THTensor_(data)(gradInput_n), n + ); + } + + // Free + THTensor_(free)(gradInput_n); + THTensor_(free)(gradOutput_n); + + // Resize output + if (batch == 0) + { + THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); + THTensor_(resize4d)(gradInput, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); + THTensor_(free)(weight); +} + +void THNN_(VolumetricFullConvolution_accGradParameters)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradWeight, + THTensor *gradBias, + THTensor *finput, + THTensor *fgradInput, + int dT, int dW, int dH, // stride + int pT, int pW, int pH, // padding + int aT, int aW, int aH, // extra output adjustment + accreal scale_) +{ + real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_); + // number of input & output planes and kernel size is indirectly defined by the gradWeight tensor + THNN_(VolumetricFullConvolution_shapeCheck)( + input, gradOutput, gradWeight, gradBias, + dT, dW, dH, pT, pW, pH, aT, aW, aH); + + int nInputPlane = (int)gradWeight->size[0]; + int nOutputPlane = (int)gradWeight->size[1]; + int kT = (int)gradWeight->size[2]; + int kH = (int)gradWeight->size[3]; + int kW = (int)gradWeight->size[4]; + + THTensor *columns = finput; + THTensor *ones = fgradInput; + + input = THTensor_(newContiguous)(input); + gradOutput = THTensor_(newContiguous)(gradOutput); + THArgCheck(THTensor_(isContiguous)(gradWeight), 4, "gradWeight needs to be contiguous"); + if (gradBias) + THArgCheck(THTensor_(isContiguous)(gradBias), 5, "gradBias needs to be contiguous"); + + int batch = 1; + if (input->nDimension == 4) + { + // Force batch + batch = 0; + THTensor_(resize5d)(input, 1, input->size[0], input->size[1], input->size[2], input->size[3]); + THTensor_(resize5d)(gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2], gradOutput->size[3]); + } + + const long inputWidth = input->size[4]; + const long inputHeight = input->size[3]; + const 
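/* accGradParameters mirrors updateGradInput: vol2col turns gradOutput into
   columns, a single GEMM against the input then accumulates into gradWeight
   (scaled by the scale argument), and a GEMV against the ones buffer
   accumulates gradBias. */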
long inputDepth = input->size[2]; + const long outputWidth = (inputWidth - 1) * dW - 2*pW + kW + aW; + const long outputHeight = (inputHeight - 1) * dH - 2*pH + kH + aH; + const long outputDepth = (inputDepth - 1) * dT - 2*pT + kT + aT; + + // Batch size + input planes + const long batchSize = input->size[0]; + + // Define a buffer of ones, for bias accumulation + if (ones->nDimension != 3 || ones->size[0]*ones->size[1]*ones->size[2] < outputDepth*outputHeight*outputWidth) + { + // Resize plane and fill with ones... + THTensor_(resize3d)(ones, outputDepth, outputHeight, outputWidth); + THTensor_(fill)(ones, 1); + } + + // Resize temporary columns + THTensor_(resize2d)(columns, nOutputPlane*kW*kH*kT, inputDepth*inputHeight*inputWidth); + + // Helpers + THTensor *input_n = THTensor_(new)(); + THTensor *gradOutput_n = THTensor_(new)(); + + int elt; + // For each elt in batch, do: + for (elt = 0; elt < batchSize; ++elt) + { + // Matrix mulitply per output: + THTensor_(select)(input_n, input, 0, elt); + THTensor_(select)(gradOutput_n, gradOutput, 0, elt); + + // Extract columns: + THNN_(vol2col)( + THTensor_(data)(gradOutput_n), nOutputPlane, + outputDepth, outputHeight, outputWidth, + kT, kH, kW, + pT, pH, pW, + dT, dH, dW, + 1, 1, 1, + THTensor_(data)(columns) + ); + + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + const long n = columns->size[0]; // nOutputPlane * kt * kh * kw + const long m = input_n->size[0]; // nInputPlane + const long k = columns->size[1]; // inputHeight * inputWidth + + // Do GEMM (note: this is a bit confusing because gemm assumes column-major matrices) + THBlas_(gemm)( + 't', 'n', + n, m, k, + scale, + THTensor_(data)(columns), k, + THTensor_(data)(input_n), k, + 1, + THTensor_(data)(gradWeight), n + ); + + // Do Bias: + // M,N,K are dims of matrix A and B + // (see http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm) + const long m_ = nOutputPlane; + const long k_ = outputDepth * outputHeight * outputWidth; + + // Do GEMV (note: this is a bit confusing because gemv assumes column-major matrices) + if (gradBias) { + THBlas_(gemv)( + 't', + k_, m_, + scale, + THTensor_(data)(gradOutput_n), k_, + THTensor_(data)(ones), 1, + 1, + THTensor_(data)(gradBias), 1 + ); + } + } + + // Free + THTensor_(free)(input_n); + THTensor_(free)(gradOutput_n); + + // Resize + if (batch == 0) + { + THTensor_(resize4d)(gradOutput, nOutputPlane, outputDepth, outputHeight, outputWidth); + THTensor_(resize4d)(input, nInputPlane, inputDepth, inputHeight, inputWidth); + } + + THTensor_(free)(input); + THTensor_(free)(gradOutput); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxPooling.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxPooling.c new file mode 100644 index 000000000..a3601e0b6 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxPooling.c @@ -0,0 +1,50 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricMaxPooling.c" +#else + +void THNN_(VolumetricMaxPooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + bool ceilMode) +{ + THNN_(VolumetricDilatedMaxPooling_updateOutput)( + state, input, output, indices, + kT, kW, kH, dT, dW, dH, + pT, pW, pH, 1, 1, 1, ceilMode); +} + +void THNN_(VolumetricMaxPooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + 
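/* As with updateOutput above, this is a thin wrapper: it forwards to the
   dilated variant with dilationT = dilationW = dilationH = 1. */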
THIndexTensor *indices, + int kT, + int kW, + int kH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH, + bool ceilMode) +{ + THNN_(VolumetricDilatedMaxPooling_updateGradInput)( + state, input, gradOutput, gradInput, indices, + kT, kW, kH, dT, dW, dH, + pT, pW, pH, 1, 1, 1, ceilMode); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxUnpooling.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxUnpooling.c new file mode 100644 index 000000000..d9d9e5951 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricMaxUnpooling.c @@ -0,0 +1,373 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricMaxUnpooling.c" +#else + +static inline void THNN_(VolumetricMaxUnpooling_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THIndexTensor *indices, + int oT, + int oW, + int oH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) +{ + THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input, + "4D or 5D (batch mode) tensor expected for input, but got: %s"); + + THNN_CHECK_SHAPE_INDICES(input, indices); + + THArgCheck(dT > 0 && dW > 0 && dH > 0, 10, + "stride should be greater than zero, but got dT: %d dH: %d dW: %d", + dT, dH, dW); + + int dimw = 3; + int dimh = 2; + int dimt = 1; + int dimn = 0; + + if (input->nDimension == 5) + { + dimt++; + dimw++; + dimh++; + dimn++; + } + int nslices = input->size[dimn]; + + if (gradOutput != NULL) { + if (oT != gradOutput->size[dimt] || oW != gradOutput->size[dimw] || oH != gradOutput->size[dimh]) + { + THError( + "Inconsistent gradOutput size. oT= %d, oH= %d, oW= %d, gradOutput: %dx%dx%d", + oT, oH, oW, gradOutput->size[dimt], gradOutput->size[dimh], gradOutput->size[dimw] + ); + } + + THNN_CHECK_DIM_SIZE(gradOutput, input->nDimension, dimn, nslices); + } +} + +static void THNN_(VolumetricMaxUnpooling_updateOutput_frame)( + real *input_p, + real *output_p, + THIndex_t *ind_p, + int nslices, + int iT, + int iW, + int iH, + int oT, + int oW, + int oH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) +{ + int k; + int has_error = 0; + THIndex_t error_index; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + int ti, i, j, maxz, maxy, maxx; + for (ti = 0; ti < iT; ti++) + { + for (i = 0; i < iH; i++) + { + for (j = 0; j < iW; j++) + { + int start_t = ti * dT - pT; + int start_h = i * dH - pH; + int start_w = j * dW - pW; + + real *input_p_k = input_p + k*iT*iW*iH + ti*iW*iH + i*iW + j; + THIndex_t *ind_p_k = ind_p + k*iT*iW*iH + ti*iW*iH + i*iW + j; + + maxz = ((unsigned char*)(ind_p_k))[0]; /* retrieve position of max */ + maxy = ((unsigned char*)(ind_p_k))[1]; + maxx = ((unsigned char*)(ind_p_k))[2]; + + THIndex_t idx = k*oT*oW*oH + oH*oW*(start_t+maxz) + oW*(start_h+maxy) + (start_w+maxx); + if (start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0 || start_t+maxz>=oT + || start_h+maxy>=oH || start_w+maxx>=oW) + { +#pragma omp critical + { + has_error = 1; + error_index = idx; + } + } else { + output_p[idx] = *input_p_k; /* update output */ + } + } + } + } + } + if (has_error) { + THError( + "found an invalid max index %ld (output volumes are of size %dx%dx%d)", + error_index, oT, oH, oW + ); + } +} + +void THNN_(VolumetricMaxUnpooling_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + THIndexTensor *indices, + int oT, + int oW, + int oH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) +{ + int dimw = 3; + int dimh = 2; + int dimt = 1; + int nbatch = 1; + int nslices; + int iT; 
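/* Unpooling reverses the pooling forward pass: the output is zeroed, then
   the _frame helper above scatters each input value to start + packed byte
   offset (the same (z, y, x) encoding the pooling code wrote), raising an
   error on any index that falls outside oT x oH x oW. */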
+ int iH; + int iW; + real *input_data; + real *output_data; + THIndex_t *indices_data; + + THNN_(VolumetricMaxUnpooling_shapeCheck)( + state, input, NULL, indices, + oT, oW, oH, dT, dW, dH, pT, pW, pH); + + if (input->nDimension == 5) + { + nbatch = input->size[0]; + dimt++; + dimw++; + dimh++; + } + + /* sizes */ + nslices = input->size[dimt-1]; + iT = input->size[dimt]; + iH = input->size[dimh]; + iW = input->size[dimw]; + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + indices = THIndexTensor_(newContiguous)(indices); + + /* resize output */ + if (input->nDimension == 4) + { + THTensor_(resize4d)(output, nslices, oT, oH, oW); + THTensor_(zero)(output); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + THNN_(VolumetricMaxUnpooling_updateOutput_frame)( + input_data, output_data, + indices_data, + nslices, + iT, iW, iH, + oT, oW, oH, + dT, dW, dH, pT, pW, pH + ); + } + else + { + int p; + + THTensor_(resize5d)(output, nbatch, nslices, oT, oH, oW); + THTensor_(zero)(output); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + indices_data = THIndexTensor_(data)(indices); + + for (p = 0; p < nbatch; p++) + { + THNN_(VolumetricMaxUnpooling_updateOutput_frame)( + input_data+p*nslices*iT*iW*iH, + output_data+p*nslices*oT*oW*oH, + indices_data+p*nslices*iT*iW*iH, + nslices, + iT, iW, iH, + oT, oW, oH, + dT, dW, dH, + pT, pW, pH + ); + } + } + + /* cleanup */ + THTensor_(free)(input); + THIndexTensor_(free)(indices); +} + +static void THNN_(VolumetricMaxUnpooling_updateGradInput_frame)( + real *gradInput_p, + real *gradOutput_p, + THIndex_t *ind_p, + int nslices, + int iT, + int iW, + int iH, + int oT, + int oW, + int oH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) +{ + int k; +#pragma omp parallel for private(k) + for (k = 0; k < nslices; k++) + { + int ti, i, j, maxz, maxy, maxx; + for (ti = 0; ti < iT; ti++) + { + for (i = 0; i < iH; i++) + { + for (j = 0; j < iW; j++) + { + int start_t = ti * dT - pT; + int start_h = i * dH - pH; + int start_w = j * dW - pW; + + real *gradInput_p_k = gradInput_p + k*iT*iW*iH + ti*iW*iH + i*iW + j; + THIndex_t *ind_p_k = ind_p + k*iT*iW*iH + ti*iW*iH + i*iW + j; + + maxz = ((unsigned char*)(ind_p_k))[0]; /* retrieve position of max */ + maxy = ((unsigned char*)(ind_p_k))[1]; + maxx = ((unsigned char*)(ind_p_k))[2]; + + if (start_t+maxz<0 || start_h+maxy<0 || start_w+maxx<0 + || start_t+maxz>=oT || start_h+maxy>=oH || start_w+maxx>=oW) + { + THError( + "invalid max index z= %d, y= %d, x= %d, oT= %d, oW= %d, oH= %d", + start_t+maxz, start_h+maxy, start_w+maxx, oT, oW, oH + ); + } + *gradInput_p_k = gradOutput_p[k*oT*oW*oH + oH*oW*(start_t+maxz) + + oW*(start_h+maxy) + (start_w+maxx)]; /* update gradient */ + } + } + } + } +} + +void THNN_(VolumetricMaxUnpooling_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + THIndexTensor *indices, + int oT, + int oW, + int oH, + int dT, + int dW, + int dH, + int pT, + int pW, + int pH) +{ + int dimw = 3; + int dimh = 2; + int dimt = 1; + int nbatch = 1; + int nslices; + int iT; + int iH; + int iW; + real *gradInput_data; + real *gradOutput_data; + THIndex_t *indices_data; + + THNN_(VolumetricMaxUnpooling_shapeCheck)( + state, input, gradOutput, indices, + oT, oW, oH, dT, dW, dH, pT, pW, pH); + + // TODO: check gradOutput shape + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + 
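/* newContiguous either retains the tensor (when it is already contiguous)
   or returns a contiguous copy; the raw-pointer arithmetic below assumes
   contiguous layout, and the matching _free calls at the end release
   whichever tensor was returned. */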
indices = THIndexTensor_(newContiguous)(indices); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + if (input->nDimension == 5) + { + nbatch = input->size[0]; + dimt++; + dimw++; + dimh++; + } + + /* sizes */ + nslices = input->size[dimt-1]; + iT = input->size[dimt]; + iH = input->size[dimh]; + iW = input->size[dimw]; + + /* get raw pointers */ + gradInput_data = THTensor_(data)(gradInput); + gradOutput_data = THTensor_(data)(gradOutput); + indices_data = THIndexTensor_(data)(indices); + + /* backprop */ + if (input->nDimension == 4) + { + THNN_(VolumetricMaxUnpooling_updateGradInput_frame)( + gradInput_data, gradOutput_data, + indices_data, + nslices, + iT, iW, iH, + oT, oW, oH, + dT, dW, dH, + pT, pW, pH + ); + } + else + { + int p; + for (p = 0; p < nbatch; p++) + { + THNN_(VolumetricMaxUnpooling_updateGradInput_frame)( + gradInput_data+p*nslices*iT*iW*iH, + gradOutput_data+p*nslices*oT*oW*oH, + indices_data+p*nslices*iT*iW*iH, + nslices, + iT, iW, iH, + oT, oW, oH, + dT, dW, dH, + pT, pW, pH + ); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); + THIndexTensor_(free)(indices); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricReplicationPadding.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricReplicationPadding.c new file mode 100644 index 000000000..4d8993ec2 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricReplicationPadding.c @@ -0,0 +1,357 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricReplicationPadding.c" +#else + +static inline void THNN_(VolumetricReplicationPadding_shapeCheck)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) { + int dimw = 3; + int dimh = 2; + int dimd = 1; + int dimslices = 0; + long nslices; + long idepth; + long iheight; + long iwidth; + long odepth; + long oheight; + long owidth; + + THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input, + "4D or 5D (batch mode) tensor expected for input, but got: %s"); + + if (input->nDimension == 5) + { + dimw++; + dimh++; + dimd++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + idepth = input->size[dimd]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + odepth = idepth + pfront + pback; + oheight = iheight + ptop + pbottom; + owidth = iwidth + pleft + pright; + + THArgCheck(owidth >= 1 || oheight >= 1 || odepth >= 1, 2, + "input (D: %d H: %d, W: %d)is too small." + " Calculated output D: %d H: %d W: %d", + idepth, iheight, iwidth, odepth, oheight, owidth); + + if (gradOutput != NULL) { + THArgCheck(nslices == THTensor_(size)(gradOutput, dimslices), 3, + "gradOutput width unexpected. Expected: %d, Got: %d", + nslices, THTensor_(size)(gradOutput, dimslices)); + THArgCheck(owidth == THTensor_(size)(gradOutput, dimw), 3, + "gradOutput width unexpected. Expected: %d, Got: %d", + owidth, THTensor_(size)(gradOutput, dimw)); + THArgCheck(oheight == THTensor_(size)(gradOutput, dimh), 3, + "gradOutput height unexpected. Expected: %d, Got: %d", + oheight, THTensor_(size)(gradOutput, dimh)); + THArgCheck(odepth == THTensor_(size)(gradOutput, dimd), 3, + "gradOutput depth unexpected. 
Expected: %d, Got: %d", + odepth, THTensor_(size)(gradOutput, dimd)); + } +} + +static void THNN_(VolumetricReplicationPadding_updateOutput_frame)( + real *input_p, real *output_p, + long nslices, + long iwidth, long iheight, long idepth, + long owidth, long oheight, long odepth, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) +{ + int iStartX = fmax(0, -pleft); + int iStartY = fmax(0, -ptop); + int iStartZ = fmax(0, -pfront); + int oStartX = fmax(0, pleft); + int oStartY = fmax(0, ptop); + int oStartZ = fmax(0, pfront); + + long k, ip_x, ip_y, ip_z; +#pragma omp parallel for private(k, ip_x, ip_y, ip_z) + for (k = 0; k < nslices; k++) { + long i, j, z; + for (z = 0; z < odepth; z++) { + for (i = 0; i < oheight; i++) { + for (j = 0; j < owidth; j++) { + if (j < pleft) { + ip_x = pleft; + } else if (j >= pleft && j < iwidth + pleft) { + ip_x = j; + } else { + ip_x = iwidth + pleft - 1; + } + ip_x = ip_x - oStartX + iStartX; + + if (i < ptop) { + ip_y = ptop; + } else if (i >= ptop && i < iheight + ptop) { + ip_y = i; + } else { + ip_y = iheight + ptop - 1; + } + ip_y = ip_y - oStartY + iStartY; + + if (z < pfront) { + ip_z = pfront; + } else if (z >= pfront && z < idepth + pfront) { + ip_z = z; + } else { + ip_z = idepth + pfront - 1; + } + ip_z = ip_z - oStartZ + iStartZ; + + real *dest_p = output_p + k * owidth * oheight * odepth + + z * owidth * oheight + i * owidth + j; + real *src_p = input_p + k * iwidth * iheight * idepth + + ip_z * iwidth * iheight + ip_y * iwidth + ip_x; + *dest_p = *src_p; + } + } + } + } +} + +void THNN_(VolumetricReplicationPadding_updateOutput)(THNNState *state, + THTensor *input, + THTensor *output, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) +{ + int dimw = 3; + int dimh = 2; + int dimd = 1; + int dimslices = 0; + long nbatch = 1; + long nslices; + long idepth; + long iheight; + long iwidth; + long odepth; + long oheight; + long owidth; + real *input_data; + real *output_data; + +THNN_(VolumetricReplicationPadding_shapeCheck)( + state, input, NULL, pleft, pright, + ptop, pbottom, pfront, pback); + + if (input->nDimension == 5) + { + nbatch = input->size[0]; + dimw++; + dimh++; + dimd++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + idepth = input->size[dimd]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + odepth = idepth + pfront + pback; + oheight = iheight + ptop + pbottom; + owidth = iwidth + pleft + pright; + + /* get contiguous input */ + input = THTensor_(newContiguous)(input); + + /* resize output */ + if (input->nDimension == 4) + { + THTensor_(resize4d)(output, nslices, odepth, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + + THNN_(VolumetricReplicationPadding_updateOutput_frame)( + input_data, output_data, nslices, iwidth, iheight, idepth, + owidth, oheight, odepth, pleft, pright, ptop, pbottom, pfront, + pback); + } + else + { + long p; + + THTensor_(resize5d)(output, nbatch, nslices, odepth, oheight, owidth); + + input_data = THTensor_(data)(input); + output_data = THTensor_(data)(output); + +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) + { + THNN_(VolumetricReplicationPadding_updateOutput_frame)( + input_data + p * nslices * iwidth * iheight * idepth, + output_data + p * nslices * owidth * oheight * odepth, + nslices, + iwidth, iheight, idepth, + owidth, oheight, odepth, + pleft, pright, + ptop, pbottom, + pfront, pback); + } + } + + /* cleanup */ + 
THTensor_(free)(input); +} + +static void THNN_(VolumetricReplicationPadding_updateGradInput_frame)( + real *ginput_p, real *goutput_p, + long nslices, + long iwidth, long iheight, long idepth, + long owidth, long oheight, long odepth, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) +{ + int iStartX = fmax(0, -pleft); + int iStartY = fmax(0, -ptop); + int iStartZ = fmax(0, -pfront); + int oStartX = fmax(0, pleft); + int oStartY = fmax(0, ptop); + int oStartZ = fmax(0, pfront); + + long k, ip_x, ip_y, ip_z; +#pragma omp parallel for private(k, ip_x, ip_y, ip_z) + for (k = 0; k < nslices; k++) { + long i, j, z; + for (z = 0; z < odepth; z++) { + for (i = 0; i < oheight; i++) { + for (j = 0; j < owidth; j++) { + if (j < pleft) { + ip_x = pleft; + } else if (j >= pleft && j < iwidth + pleft) { + ip_x = j; + } else { + ip_x = iwidth + pleft - 1; + } + ip_x = ip_x - oStartX + iStartX; + + if (i < ptop) { + ip_y = ptop; + } else if (i >= ptop && i < iheight + ptop) { + ip_y = i; + } else { + ip_y = iheight + ptop - 1; + } + ip_y = ip_y - oStartY + iStartY; + + if (z < pfront) { + ip_z = pfront; + } else if (z >= pfront && z < idepth + pfront) { + ip_z = z; + } else { + ip_z = idepth + pfront - 1; + } + ip_z = ip_z - oStartZ + iStartZ; + + real *src_p = goutput_p + k * owidth * oheight * odepth + + z * owidth * oheight + i * owidth + j; + real *dest_p = ginput_p + k * iwidth * iheight * idepth + + ip_z * iwidth * iheight + ip_y * iwidth + ip_x; + *dest_p += *src_p; + } + } + } + } +} + +void THNN_(VolumetricReplicationPadding_updateGradInput)(THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int pleft, int pright, + int ptop, int pbottom, + int pfront, int pback) +{ + int dimw = 3; + int dimh = 2; + int dimd = 1; + int dimslices = 0; + long nbatch = 1; + long nslices; + long idepth; + long iheight; + long iwidth; + long odepth; + long oheight; + long owidth; + + if (input->nDimension == 5) + { + nbatch = input->size[0]; + dimw++; + dimh++; + dimd++; + dimslices++; + } + + /* sizes */ + nslices = input->size[dimslices]; + idepth = input->size[dimd]; + iheight = input->size[dimh]; + iwidth = input->size[dimw]; + odepth = idepth + pfront + pback; + oheight = iheight + ptop + pbottom; + owidth = iwidth + pleft + pright; + + +THNN_(VolumetricReplicationPadding_shapeCheck)( + state, input, NULL, pleft, pright, + ptop, pbottom, pfront, pback); + + /* get contiguous gradOutput */ + gradOutput = THTensor_(newContiguous)(gradOutput); + + /* resize */ + THTensor_(resizeAs)(gradInput, input); + THTensor_(zero)(gradInput); + + /* backprop */ + if (input->nDimension == 4) { + THNN_(VolumetricReplicationPadding_updateGradInput_frame)( + THTensor_(data)(gradInput), + THTensor_(data)(gradOutput), + nslices, + iwidth, iheight, idepth, + owidth, oheight, odepth, + pleft, pright, + ptop, pbottom, + pfront, pback); + } else { + long p; +#pragma omp parallel for private(p) + for (p = 0; p < nbatch; p++) { + THNN_(VolumetricReplicationPadding_updateGradInput_frame)( + THTensor_(data)(gradInput) + p * nslices * idepth * iheight * iwidth, + THTensor_(data)(gradOutput) + p * nslices * odepth * oheight * owidth, + nslices, + iwidth, iheight, idepth, + owidth, oheight, odepth, + pleft, pright, + ptop, pbottom, + pfront, pback); + } + } + + /* cleanup */ + THTensor_(free)(gradOutput); +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingNearest.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingNearest.c new file mode 
100644 index 000000000..9068fb58d --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingNearest.c @@ -0,0 +1,226 @@ +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricUpSamplingNearest.c" +#else + + +static inline void THNN_(VolumetricUpSamplingNearest_shapeCheck) + (THTensor *input, THTensor *gradOutput, + int scale_factor) { + THArgCheck(input != NULL, 2, "5D input tensor expected but got NULL"); + THArgCheck(scale_factor > 1, 4, + "scale_factor must be greater than 1, but got: %d", scale_factor); + THNN_ARGCHECK(input->nDimension == 4 || input->nDimension == 5, 2, input, + "4D or 5D input tensor expected but got: %s"); + if (input->nDimension == 4) { + int nChannels = THTensor_(size)(input, 0); + int inputDepth = THTensor_(size)(input, 1); + int inputHeight = THTensor_(size)(input, 2); + int inputWidth = THTensor_(size)(input, 3); + int outputDepth = inputDepth * scale_factor; + int outputHeight = inputHeight * scale_factor; + int outputWidth = inputWidth * scale_factor; + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 4, 0, nChannels); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 1, outputDepth); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 2, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, 4, 3, outputWidth); + } + } else { + int nBatch = THTensor_(size)(input, 0); + int nChannels = THTensor_(size)(input, 1); + int inputDepth = THTensor_(size)(input, 2); + int inputHeight = THTensor_(size)(input, 3); + int inputWidth = THTensor_(size)(input, 4); + int outputDepth = inputDepth * scale_factor; + int outputHeight = inputHeight * scale_factor; + int outputWidth = inputWidth * scale_factor; + if (gradOutput != NULL) { + THNN_CHECK_DIM_SIZE(gradOutput, 5, 0, nBatch); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 1, nChannels); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 2, outputDepth); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 3, outputHeight); + THNN_CHECK_DIM_SIZE(gradOutput, 5, 4, outputWidth); + } + } +} + +void THNN_(VolumetricUpSamplingNearest_updateOutput)( + THNNState *state, + THTensor *input, + THTensor *output, + int scale_factor) +{ + THNN_(VolumetricUpSamplingNearest_shapeCheck)(input, NULL, scale_factor); + int inputDepth = THTensor_(size)(input, input->nDimension-3); + int inputHeight = THTensor_(size)(input, input->nDimension-2); + int inputWidth = THTensor_(size)(input, input->nDimension-1); + int outputDepth = inputDepth * scale_factor; + int outputHeight = inputHeight * scale_factor; + int outputWidth = inputWidth * scale_factor; + + if (input->nDimension == 4) { + THTensor_(resize4d)(output, + THTensor_(size)(input, 0), + outputDepth, outputHeight, outputWidth); + } else { + THTensor_(resize5d)(output, + THTensor_(size)(input, 0), + THTensor_(size)(input, 1), + outputDepth, outputHeight, outputWidth); + } + + int dT = scale_factor; + int dW = scale_factor; + int dH = scale_factor; + int xDim = input->nDimension-3; + int yDim = input->nDimension-2; + int zDim = input->nDimension-1; + + // dims + int idim = input->nDimension; + int osz0 = output->size[0]; + int osz1 = output->size[1]; + int osz2 = output->size[2]; + int osz3 = output->size[3]; + int osz4 = 1; + if (idim > 4) { + osz4 = output->size[4]; + } + + // get strides + long *is = input->stride; + long *os = output->stride; + + // get raw pointers + real *pin = THTensor_(data)(input); + real *pout = THTensor_(data)(output); + + // perform the upsampling + int i0, i1, i2, i3, i4, isrc, idst; + int iout[5]; // Output indices + int iin[5]; // Input indices + + for (i0 = 0; i0 < osz0; i0++) { + 
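/* Nearest-neighbour upsampling: each output coordinate maps to the input
   coordinate out / scale (integer division), so every input voxel is
   replicated across a dT x dH x dW block; the stride-based idst/isrc
   arithmetic below keeps this layout-agnostic. */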
iout[0] = i0; + iin[0] = i0; + for (i1 = 0; i1 < osz1; i1++) { + iout[1] = i1; + iin[1] = i1; + for (i2 = 0; i2 < osz2; i2++) { + iout[2] = i2; + iin[2] = i2; + for (i3 = 0; i3 < osz3; i3++) { + iout[3] = i3; + iin[3] = i3; + for (i4 = 0; i4 < osz4; i4++) { + iout[4] = i4; + iin[4] = i4; + + // set the indices for the upsampled dimensions + iin[xDim] = iout[xDim] / dW; + iin[yDim] = iout[yDim] / dH; + iin[zDim] = iout[zDim] / dT; + + idst = i0*os[0] + i1*os[1] + i2*os[2] + i3*os[3]; + isrc = iin[0]*is[0] + iin[1]*is[1] + iin[2]*is[2] + iin[3]*is[3]; + if (idim > 4) { + idst += i4*os[4]; + isrc += iin[4]*is[4]; + } + + pout[idst] = pin[isrc]; + } + } + } + } + } +} + +void THNN_(VolumetricUpSamplingNearest_updateGradInput)( + THNNState *state, + THTensor *input, + THTensor *gradOutput, + THTensor *gradInput, + int scale_factor) +{ + THNN_(VolumetricUpSamplingNearest_shapeCheck)(input, gradOutput, scale_factor); + THTensor_(resizeAs)(gradInput, input); + + int dW = scale_factor; + int dH = scale_factor; + int dT = scale_factor; + int xDim = gradInput->nDimension-3; + int yDim = gradInput->nDimension-2; + int zDim = gradInput->nDimension-1; + + // dims + int idim = gradInput->nDimension; // Guaranteed to be between 3 and 5 + int isz0 = gradInput->size[0]; + int isz1 = gradInput->size[1]; + int isz2 = gradInput->size[2]; + int isz3 = gradInput->size[3]; + int isz4 = 1; + if (idim > 4) { + isz4 = gradInput->size[4]; + } + + // get strides + long *is = gradInput->stride; + long *os = gradOutput->stride; + + // get raw pointers + real *pin = THTensor_(data)(gradInput); + real *pout = THTensor_(data)(gradOutput); + + // perform the upsampling + int i0, i1, i2, i3, i4, isrc, idst, x, y, z; + int iin[5]; // Input indices + int iout[5]; // Output indices + + THTensor_(zero)(gradInput); + + for (i0 = 0; i0 < isz0; i0++) { + iin[0] = i0; + iout[0] = i0; + for (i1 = 0; i1 < isz1; i1++) { + iin[1] = i1; + iout[1] = i1; + for (i2 = 0; i2 < isz2; i2++) { + iin[2] = i2; + iout[2] = i2; + for (i3 = 0; i3 < isz3; i3++) { + iin[3] = i3; + iout[3] = i3; + + for (i4 = 0; i4 < isz4; i4++) { + iin[4] = i4; + iout[4] = i4; + + idst = i0*is[0] + i1*is[1] + i2*is[2] + i3*is[3]; + if (idim > 4) { + idst += i4*is[4]; + } + + // Now accumulate the gradients from gradOutput + for (z = 0; z < dT; z++) { + for (y = 0; y < dH; y++) { + for (x = 0; x < dW; x++) { + iout[xDim] = dW * iin[xDim] + x; + iout[yDim] = dH * iin[yDim] + y; + iout[zDim] = dT * iin[zDim] + z; + isrc = iout[0]*os[0] + iout[1]*os[1] + iout[2]*os[2] + iout[3]*os[3]; + if (idim > 4) { + isrc += iout[4]*os[4]; + } + pin[idst] += pout[isrc]; + } + } + } + } + } + } + } + } +} + +#endif diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingTrilinear.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingTrilinear.c new file mode 100644 index 000000000..f2b04dba9 --- /dev/null +++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingTrilinear.c @@ -0,0 +1,213 @@ +// Adapted from interp.cpp from Caffe util by Pauline Luc +// Originally developed by George Papandreou + +#ifndef TH_GENERIC_FILE +#define TH_GENERIC_FILE "generic/VolumetricUpSamplingTrilinear.c" +#else + +static inline void THNN_(VolumetricUpSamplingTrilinear_shapeCheck) + (THTensor *input, THTensor *gradOutput, + int nBatch, int nChannels, + int inputDepth, int inputHeight, int inputWidth, + int outputDepth, int outputHeight, int outputWidth) { + THArgCheck(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 + && outputDepth > 0 && outputHeight > 0 && outputWidth > 
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingTrilinear.c b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingTrilinear.c
new file mode 100644
index 000000000..f2b04dba9
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/VolumetricUpSamplingTrilinear.c
@@ -0,0 +1,213 @@
+// Adapted from interp.cpp from Caffe util by Pauline Luc
+// Originally developed by George Papandreou
+
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/VolumetricUpSamplingTrilinear.c"
+#else
+
+static inline void THNN_(VolumetricUpSamplingTrilinear_shapeCheck)
+     (THTensor *input, THTensor *gradOutput,
+      int nBatch, int nChannels,
+      int inputDepth, int inputHeight, int inputWidth,
+      int outputDepth, int outputHeight, int outputWidth) {
+  THArgCheck(inputDepth > 0 && inputHeight > 0 && inputWidth > 0
+             && outputDepth > 0 && outputHeight > 0 && outputWidth > 0, 2,
+             "input and output sizes should be greater than 0,"
+             " but got input (D: %d, H: %d, W: %d) output (D: %d, H: %d, W: %d)",
+             inputDepth, inputHeight, inputWidth, outputDepth, outputHeight, outputWidth);
+  if (input != NULL) {
+    THNN_ARGCHECK(input->nDimension == 5, 2, input,
+                  "5D input tensor expected but got: %s");
+  }
+
+  if (gradOutput != NULL) {
+    THNN_CHECK_DIM_SIZE(gradOutput, 5, 0, nBatch);
+    THNN_CHECK_DIM_SIZE(gradOutput, 5, 1, nChannels);
+    THNN_CHECK_DIM_SIZE(gradOutput, 5, 2, outputDepth);
+    THNN_CHECK_DIM_SIZE(gradOutput, 5, 3, outputHeight);
+    THNN_CHECK_DIM_SIZE(gradOutput, 5, 4, outputWidth);
+  }
+}
+
+void THNN_(VolumetricUpSamplingTrilinear_updateOutput)(
+    THNNState *state,
+    THTensor *input,
+    THTensor *output,
+    int outputDepth,
+    int outputHeight,
+    int outputWidth)
+{
+  int nbatch = THTensor_(size)(input, 0);
+  int channels = THTensor_(size)(input, 1);
+  int inputDepth = THTensor_(size)(input, 2);
+  int inputHeight = THTensor_(size)(input, 3);
+  int inputWidth = THTensor_(size)(input, 4);
+
+  THNN_(VolumetricUpSamplingTrilinear_shapeCheck)
+    (input, NULL,
+     nbatch, channels,
+     inputDepth, inputHeight, inputWidth,
+     outputDepth, outputHeight, outputWidth);
+
+  input = THTensor_(newContiguous)(input);
+  THTensor_(resize5d)(output,
+                      THTensor_(size)(input, 0),
+                      THTensor_(size)(input, 1),
+                      outputDepth, outputHeight, outputWidth);
+  THTensor_(zero)(output);
+  real *idata = THTensor_(data)(input);
+  real *odata = THTensor_(data)(output);
+  channels = nbatch * channels;
+  THAssert(inputDepth > 0 && inputHeight > 0 && inputWidth > 0 &&
+           outputDepth > 0 && outputHeight > 0 && outputWidth > 0);
+  // special case: just copy
+  if (inputDepth == outputDepth && inputHeight == outputHeight && inputWidth == outputWidth) {
+    for (int t2 = 0; t2 < outputDepth; ++t2) {
+      const int t1 = t2;
+      for (int h2 = 0; h2 < outputHeight; ++h2) {
+        const int h1 = h2;
+        for (int w2 = 0; w2 < outputWidth; ++w2) {
+          const int w1 = w2;
+          const real* pos1 = &idata[t1 * inputHeight * inputWidth + h1 * inputWidth + w1];
+          real* pos2 = &odata[t2 * outputHeight * outputWidth + h2 * outputWidth + w2];
+          for (int c = 0; c < channels; ++c) {
+            pos2[0] = pos1[0];
+            pos1 += inputWidth * inputHeight * inputDepth;
+            pos2 += outputWidth * outputHeight * outputDepth;
+          }
+        }
+      }
+    }
+    THTensor_(free)(input);  // release the contiguous copy on the early-exit path too
+    return;
+  }
+  const float rdepth = (outputDepth > 1) ? (float)(inputDepth - 1)/(outputDepth - 1) : 0.f;
+  const float rheight = (outputHeight > 1) ? (float)(inputHeight - 1)/(outputHeight - 1) : 0.f;
+  const float rwidth = (outputWidth > 1) ? (float)(inputWidth - 1)/(outputWidth - 1) : 0.f;
+  for (int t2 = 0; t2 < outputDepth; ++t2) {
+    const float t1r = rdepth * t2;
+    const int t1 = t1r;
+    const int t1p = (t1 < inputDepth - 1) ? 1 : 0;
+    const real t1lambda = t1r - t1;
+    const real t0lambda = (real)1. - t1lambda;
+    for (int h2 = 0; h2 < outputHeight; ++h2) {
+      const float h1r = rheight * h2;
+      const int h1 = h1r;
+      const int h1p = (h1 < inputHeight - 1) ? 1 : 0;
+      const real h1lambda = h1r - h1;
+      const real h0lambda = (real)1. - h1lambda;
+      for (int w2 = 0; w2 < outputWidth; ++w2) {
+        const float w1r = rwidth * w2;
+        const int w1 = w1r;
+        const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
+        const real w1lambda = w1r - w1;
+        const real w0lambda = (real)1. - w1lambda;
+        const real* pos1 = &idata[t1 * inputHeight * inputWidth + h1 * inputWidth + w1];
+        real* pos2 = &odata[t2 * outputHeight * outputWidth + h2 * outputWidth + w2];
+        for (int c = 0; c < channels; ++c) {
+          pos2[0] = t0lambda * (h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p])
+                                + h1lambda * (w0lambda * pos1[h1p * inputWidth]
+                                              + w1lambda * pos1[h1p * inputWidth + w1p]))
+                    + t1lambda * (h0lambda * (w0lambda * pos1[t1p * inputHeight * inputWidth]
+                                              + w1lambda * pos1[t1p * inputHeight * inputWidth + w1p])
+                                  + h1lambda * (w0lambda * pos1[t1p * inputHeight * inputWidth + h1p * inputWidth]
+                                                + w1lambda * pos1[t1p * inputHeight * inputWidth + h1p * inputWidth + w1p]));
+          pos1 += inputWidth * inputHeight * inputDepth;
+          pos2 += outputWidth * outputHeight * outputDepth;
+        }
+      }
+    }
+  }
+  THTensor_(free)(input);
+}
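The lambda weights above come from the align-corners convention: output index i on a given axis maps to the real source coordinate r*i with r = (inputSize-1)/(outputSize-1), which is then split into an integer base index (t1/h1/w1) and a fractional blending weight; the 3D result is the product of the three per-axis blends over the eight surrounding voxels. A small self-contained sketch of the per-axis computation follows, in plain C; it is a demo of the arithmetic only, not THNN code:

```c
/* Sketch of the align-corners source-coordinate split used above:
 * base index plus fractional weight for the two interpolation taps. */
#include <stdio.h>

int main(void)
{
  int in_size = 4, out_size = 7;
  float r = (out_size > 1) ? (float)(in_size - 1) / (out_size - 1) : 0.f;
  for (int i = 0; i < out_size; i++) {
    float coord = r * i;
    int lo = (int)coord;                  /* t1 / h1 / w1 above      */
    int hi = (lo < in_size - 1) ? 1 : 0;  /* t1p / h1p / w1p above   */
    float lambda1 = coord - lo;           /* weight of the upper tap */
    float lambda0 = 1.f - lambda1;        /* weight of the lower tap */
    printf("out[%d] = %.2f*in[%d] + %.2f*in[%d]\n",
           i, lambda0, lo, lambda1, lo + hi);
    /* e.g. out[1] = 0.50*in[0] + 0.50*in[1] */
  }
  return 0;
}
```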
+
+void THNN_(VolumetricUpSamplingTrilinear_updateGradInput)(
+    THNNState *state,
+    THTensor *gradOutput,
+    THTensor *gradInput,
+    int nbatch,
+    int channels,
+    int inputDepth,
+    int inputHeight,
+    int inputWidth,
+    int outputDepth,
+    int outputHeight,
+    int outputWidth)
+{
+  THNN_(VolumetricUpSamplingTrilinear_shapeCheck)
+    (NULL, gradOutput,
+     nbatch, channels,
+     inputDepth, inputHeight, inputWidth,
+     outputDepth, outputHeight, outputWidth);
+
+  THTensor_(resize5d)(gradInput, nbatch, channels, inputDepth, inputHeight, inputWidth);
+  THTensor_(zero)(gradInput);
+  gradOutput = THTensor_(newContiguous)(gradOutput);
+  real *data1 = THTensor_(data)(gradInput);
+  real *data2 = THTensor_(data)(gradOutput);
+  channels = nbatch * channels;
+
+  // special case: same-size matching grids
+  if (inputDepth == outputDepth && inputHeight == outputHeight && inputWidth == outputWidth) {
+    for (int t2 = 0; t2 < outputDepth; ++t2) {
+      const int t1 = t2;
+      for (int h2 = 0; h2 < outputHeight; ++h2) {
+        const int h1 = h2;
+        for (int w2 = 0; w2 < outputWidth; ++w2) {
+          const int w1 = w2;
+          real* pos1 = &data1[t1 * inputHeight * inputWidth + h1 * inputWidth + w1];
+          const real* pos2 = &data2[t2 * outputHeight * outputWidth + h2 * outputWidth + w2];
+          for (int c = 0; c < channels; ++c) {
+            pos1[0] += pos2[0];
+            pos1 += inputWidth * inputHeight * inputDepth;
+            pos2 += outputWidth * outputHeight * outputDepth;
+          }
+        }
+      }
+    }
+    THTensor_(free)(gradOutput);  // release the contiguous copy on the early-exit path too
+    return;
+  }
+  const float rdepth = (outputDepth > 1) ? (float)(inputDepth - 1)/(outputDepth - 1) : 0.f;
+  const float rheight = (outputHeight > 1) ? (float)(inputHeight - 1)/(outputHeight - 1) : 0.f;
+  const float rwidth = (outputWidth > 1) ? (float)(inputWidth - 1)/(outputWidth - 1) : 0.f;
+  for (int t2 = 0; t2 < outputDepth; ++t2) {
+    const float t1r = rdepth * t2;
+    const int t1 = t1r;
+    const int t1p = (t1 < inputDepth - 1) ? 1 : 0;
+    const real t1lambda = t1r - t1;
+    const real t0lambda = (real)1. - t1lambda;
+    for (int h2 = 0; h2 < outputHeight; ++h2) {
+      const float h1r = rheight * h2;
+      const int h1 = h1r;
+      const int h1p = (h1 < inputHeight - 1) ? 1 : 0;
+      const real h1lambda = h1r - h1;
+      const real h0lambda = (real)1. - h1lambda;
+      for (int w2 = 0; w2 < outputWidth; ++w2) {
+        const float w1r = rwidth * w2;
+        const int w1 = w1r;
+        const int w1p = (w1 < inputWidth - 1) ? 1 : 0;
+        const real w1lambda = w1r - w1;
+        const real w0lambda = (real)1. - w1lambda;
+        real* pos1 = &data1[t1 * inputHeight * inputWidth + h1 * inputWidth + w1];
+        const real* pos2 = &data2[t2 * outputHeight * outputWidth + h2 * outputWidth + w2];
+        for (int c = 0; c < channels; ++c) {
+          pos1[0] += t0lambda * h0lambda * w0lambda * pos2[0];
+          pos1[w1p] += t0lambda * h0lambda * w1lambda * pos2[0];
+          pos1[h1p * inputWidth] += t0lambda * h1lambda * w0lambda * pos2[0];
+          pos1[h1p * inputWidth + w1p] += t0lambda * h1lambda * w1lambda * pos2[0];
+          pos1[t1p * inputHeight * inputWidth] += t1lambda * h0lambda * w0lambda * pos2[0];
+          pos1[t1p * inputHeight * inputWidth + w1p] += t1lambda * h0lambda * w1lambda * pos2[0];
+          pos1[t1p * inputHeight * inputWidth + h1p * inputWidth] += t1lambda * h1lambda * w0lambda * pos2[0];
+          pos1[t1p * inputHeight * inputWidth + h1p * inputWidth + w1p] += t1lambda * h1lambda * w1lambda * pos2[0];
+          pos1 += inputWidth * inputHeight * inputDepth;
+          pos2 += outputWidth * outputHeight * outputDepth;
+        }
+      }
+    }
+  }
+  THTensor_(free)(gradOutput);
+}
+
+#endif
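The backward kernel is the exact transpose of the forward blend: each gradOutput element is scattered to the same eight input corners with the same lambda products, so the total gradient mass is conserved. A one-axis sketch of this scatter, in plain C and purely illustrative:

```c
/* Sketch of the backward pass for one axis: each gradOutput element is
 * scattered to its two input taps with the forward lambda weights, so
 * the backward pass is the transpose of the forward interpolation. */
#include <stdio.h>

int main(void)
{
  int in_size = 4, out_size = 7;
  float grad_out[7] = {1, 1, 1, 1, 1, 1, 1};
  float grad_in[4] = {0};
  float r = (out_size > 1) ? (float)(in_size - 1) / (out_size - 1) : 0.f;
  for (int i = 0; i < out_size; i++) {
    float coord = r * i;
    int lo = (int)coord;
    int hi = (lo < in_size - 1) ? 1 : 0;
    float lambda1 = coord - lo;
    grad_in[lo]      += (1.f - lambda1) * grad_out[i];
    grad_in[lo + hi] += lambda1 * grad_out[i];
  }
  for (int j = 0; j < in_size; j++)
    printf("grad_in[%d] = %g\n", j, grad_in[j]); /* 1.5 2 2 1.5, sums to 7 */
  return 0;
}
```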
diff --git a/contrib/lua-torch/nn/lib/THNN/generic/unfold.c b/contrib/lua-torch/nn/lib/THNN/generic/unfold.c
new file mode 100644
index 000000000..14a73b567
--- /dev/null
+++ b/contrib/lua-torch/nn/lib/THNN/generic/unfold.c
@@ -0,0 +1,166 @@
+#ifndef TH_GENERIC_FILE
+#define TH_GENERIC_FILE "generic/unfold.c"
+#else
+
+/* note: due to write issues, this one cannot be parallelized as well as unfolded_copy */
+void THNN_(unfolded_acc)(
+  THTensor *finput,
+  THTensor *input,
+  int kW,
+  int kH,
+  int dW,
+  int dH,
+  int padW,
+  int padH,
+  int nInputPlane,
+  int inputWidth,
+  int inputHeight,
+  int outputWidth,
+  int outputHeight)
+{
+  // This function assumes that
+  // outputHeight*dH does not overflow a long
+  // outputWidth*dW does not overflow a long
+
+  int nip;
+
+  real *input_data = THTensor_(data)(input);
+  real *finput_data = THTensor_(data)(finput);
+
+#pragma omp parallel for private(nip)
+  for(nip = 0; nip < nInputPlane; nip++)
+  {
+    int kw, kh, y, x;
+    long ix, iy;
+    for(kh = 0; kh < kH; kh++)
+    {
+      for(kw = 0; kw < kW; kw++)
+      {
+        real *src = finput_data + nip*((size_t)kH*kW*outputHeight*outputWidth) + kh*((size_t)kW*outputHeight*outputWidth) + kw*((size_t)outputHeight*outputWidth);
+        real *dst = input_data + nip*((size_t)inputHeight*inputWidth);
+        if (padW > 0 || padH > 0) {
+          int lpad,rpad;
+          for(y = 0; y < outputHeight; y++) {
+            iy = (long)y*dH - padH + kh;
+            if (iy < 0 || iy >= inputHeight) {
+              // row falls entirely in the padding: nothing to accumulate
+            } else {
+              if (dW==1){
+                ix = 0 - padW + kw;
+                lpad = fmaxf(0,padW-kw);
+                rpad = fmaxf(0,padW-(kW-kw-1));
+                real *dst_slice = dst+(size_t)iy*inputWidth+ix+lpad;
+                THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+lpad, 1, outputWidth - lpad - rpad); /* note: THVector_add could handle 1 value better */
+              }
+              else{
+                for (x=0; x<outputWidth; x++){
+                  ix = (long)x*dW - padW + kw;
+                  if (ix < 0 || ix >= inputWidth){
+                    // column falls in the padding: nothing to accumulate
+                  }else{
+                    real *dst_slice = dst+(size_t)iy*inputWidth+ix;
+                    THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+x, 1, 1);
+                  }
+                }
+              }
+            }
+          }
+        } else {
+          for(y = 0; y < outputHeight; y++) {
+            iy = (long)y*dH + kh;
+            ix = 0 + kw;
+            if (dW == 1) {
+              real *dst_slice = dst+(size_t)iy*inputWidth+ix;
+              THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth, 1, outputWidth); /* note: THVector_add could handle 1 value better */
+            }else{
+              for(x = 0; x < outputWidth; x++) {
+                real *dst_slice = dst+(size_t)iy*inputWidth+ix+x*dW;
+                THVector_(cadd)(dst_slice, dst_slice, src+(size_t)y*outputWidth+x, 1, 1);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
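unfolded_acc is the col2im-style inverse of unfolded_copy below: columns of the unfolded buffer are accumulated back into the image, so pixels covered by several overlapping kernel windows receive the sum of their copies. Those overlapping writes are exactly the conflict the top comment cites as the reason the loop is parallelized over input planes rather than over columns. A minimal 1D sketch of the accumulation follows; `unfolded_acc_1d` is a hypothetical name for illustration, not a THNN entry point:

```c
/* Minimal 1D "col2im" sketch of unfolded_acc (no padding): each column
 * of the unfolded buffer is added back at its source image position. */
#include <stdio.h>

static void unfolded_acc_1d(const float *finput, float *input,
                            int kW, int dW, int inputWidth, int outputWidth)
{
  for (int kw = 0; kw < kW; kw++)            /* kernel offset   */
    for (int x = 0; x < outputWidth; x++) {  /* output column   */
      int ix = x * dW + kw;                  /* image position  */
      input[ix] += finput[kw * outputWidth + x];
    }
}

int main(void)
{
  /* kW=2, dW=1: image width 4, output width 3, unfolded buffer 2x3 */
  float finput[6] = {1, 1, 1, 1, 1, 1};
  float input[4] = {0};
  unfolded_acc_1d(finput, input, 2, 1, 4, 3);
  printf("%g %g %g %g\n", input[0], input[1], input[2], input[3]); /* 1 2 2 1 */
  return 0;
}
```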
+
+void THNN_(unfolded_copy)(
+  THTensor *finput,
+  THTensor *input,
+  int kW,
+  int kH,
+  int dW,
+  int dH,
+  int padW,
+  int padH,
+  int nInputPlane,
+  int inputWidth,
+  int inputHeight,
+  int outputWidth,
+  int outputHeight)
+{
+  // This function assumes that
+  // kH*kW does not overflow an int
+  // nInputPlane*kH*kW does not overflow a long
+  // outputHeight*dH does not overflow a long
+  // outputWidth*dW does not overflow a long
+
+  long k;
+  real *input_data = THTensor_(data)(input);
+  real *finput_data = THTensor_(data)(finput);
+
+#pragma omp parallel for private(k)
+  for(k = 0; k < (long)nInputPlane*kH*kW; k++) {
+    long nip = k / (kH*kW);
+    long rest = k % (kH*kW);
+    long kh = rest / kW;
+    long kw = rest % kW;
+    int x, y;
+    long ix, iy;
+    real *dst = finput_data + nip*((size_t)kH*kW*outputHeight*outputWidth) + kh*((size_t)kW*outputHeight*outputWidth) + kw*((size_t)outputHeight*outputWidth);
+    real *src = input_data + nip*((size_t)inputHeight*inputWidth);
+    if (padW > 0 || padH > 0) {
+      long lpad,rpad;
+      for(y = 0; y < outputHeight; y++) {
+        iy = (long)y*dH - padH + kh;
+        if (iy < 0 || iy >= inputHeight) {
+          memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*outputWidth);
+        } else {
+          if (dW==1){
+            ix = 0 - padW + kw;
+            lpad = fmaxf(0,padW-kw);
+            rpad = fmaxf(0,padW-(kW-kw-1));
+            if (outputWidth-rpad-lpad <= 0) {
+              memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*outputWidth);
+            } else {
+              if (lpad > 0) memset(dst+(size_t)y*outputWidth, 0, sizeof(real)*lpad);
+              memcpy(dst+(size_t)y*outputWidth+lpad, src+(size_t)iy*inputWidth+ix+lpad, sizeof(real)*(outputWidth-rpad-lpad));
+              if (rpad > 0) memset(dst+(size_t)y*outputWidth + outputWidth - rpad, 0, sizeof(real)*rpad);
+            }
+          }
+          else{
+            for (x=0; x<outputWidth; x++){
+              ix = (long)x*dW - padW + kw;
+              if (ix < 0 || ix >= inputWidth)
+                memset(dst+(size_t)y*outputWidth+x, 0, sizeof(real)*1);
+              else
+                memcpy(dst+(size_t)y*outputWidth+x, src+(size_t)iy*inputWidth+ix, sizeof(real)*(1));
+            }
+          }
+        }
+      }
+    } else {
+      for(y = 0; y < outputHeight; y++) {
+        iy = (long)y*dH + kh;
+        ix = 0 + kw;
+        if (dW == 1)
+          memcpy(dst+(size_t)y*outputWidth, src+(size_t)iy*inputWidth+ix, sizeof(real)*outputWidth);
+        else{
+          for (x=0; x<outputWidth; x++)
+            memcpy(dst+(size_t)y*outputWidth+x, src+(size_t)iy*inputWidth+ix+(long)x*dW, sizeof(real)*(1));
+        }
+      }
+    }
+  }
+}
+
+#endif
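unfolded_copy is the im2col building block used by the THNN convolution kernels: every kW-by-kH window of every input plane is copied out as a column of finput, with zeros written where the window overlaps the padding, so the convolution itself reduces to one large matrix multiplication against the weights. A minimal 1D sketch of the resulting layout, in plain C and illustrative only:

```c
/* Minimal 1D "im2col" sketch of unfolded_copy: each kernel offset kw
 * produces one row of the unfolded buffer, zero-filled in the padding. */
#include <stdio.h>

static void unfolded_copy_1d(float *finput, const float *input,
                             int kW, int dW, int padW,
                             int inputWidth, int outputWidth)
{
  for (int kw = 0; kw < kW; kw++)
    for (int x = 0; x < outputWidth; x++) {
      int ix = x * dW - padW + kw;  /* source position in the image */
      finput[kw * outputWidth + x] =
        (ix >= 0 && ix < inputWidth) ? input[ix] : 0.f;
    }
}

int main(void)
{
  float input[4] = {1, 2, 3, 4};
  float finput[2 * 5];  /* kW=2, dW=1, padW=1 -> outputWidth = 5 */
  unfolded_copy_1d(finput, input, 2, 1, 1, 4, 5);
  for (int kw = 0; kw < 2; kw++) {
    for (int x = 0; x < 5; x++)
      printf("%g ", finput[kw * 5 + x]);
    printf("\n");  /* row 0: 0 1 2 3 4 ; row 1: 1 2 3 4 0 */
  }
  return 0;
}
```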