#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/IndexLinear.c"
#else

#ifdef _OPENMP
#include <omp.h>
#endif

/* Threshold used to trigger multithreading */
#ifndef THNN_SPARSE_OMP_THRESHOLD
#define THNN_SPARSE_OMP_THRESHOLD 100000
#endif

/* Threshold used to trigger BLAS axpy call */
#ifndef THNN_SPARSE_OUTDIM_THRESHOLD
#define THNN_SPARSE_OUTDIM_THRESHOLD 49
#endif

/* sign MACRO */
#ifndef THNN_INDEXLINEAR_SIGN
#define THNN_INDEXLINEAR_SIGN(a) ( ( (a) < 0 ) ? -1 : ( (a) > 0 ) )
#endif

static bool THNN_(checkKeysValues)(THLongTensor* keys, THTensor* values)
{
  return THLongTensor_size(keys, 0) == THTensor_(nElement)(values)
      && THTensor_(nDimension)(values) == 1
      && THLongTensor_nDimension(keys) == 1;
}

void THNN_(IndexLinear_updateOutput)(
          THNNState *state,
          THLongTensor *keys,
          long keysOffset,
          THTensor *values,
          THLongTensor *sizes,
          THLongTensor *cumSumSizes,
          THTensor *output,
          THTensor *weight,
          THTensor *bias,
          THTensor *normalizedValues,
          int train)
{
  /* Retrieve all the dimensions of the problem */
  long batchSize = THLongTensor_size(sizes, 0);
  long keysSize = THLongTensor_size(keys, 0);
  long outDim = THTensor_(size)(bias, 0);
  long woutDim = THTensor_(size)(weight, 1);
  int maxNormalize = woutDim - outDim;
  long* sizesData = THLongTensor_data(sizes);
  long* cumSumSizesData = THLongTensor_data(cumSumSizes);

  /* Define/resize the normalized values tensor if maxNormalize is > 0 */
  real* normalizedValuesData = NULL;
  if (maxNormalize) {
    THTensor_(resize1d)(normalizedValues, keysSize);
    normalizedValuesData = THTensor_(data)(normalizedValues);
  }

  /* Resize the output */
  THTensor_(resize2d)(output, batchSize, outDim);

  /* Access the storage data/strides */
  real* outputData = THTensor_(data)(output);
  real* valuesData = THTensor_(data)(values);
  real* weightData = THTensor_(data)(weight);
  long weightStride0 = weight->stride[0];
  real* biasData = THTensor_(data)(bias);
  long* keysData = THLongTensor_data(keys);

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(output), 6, "output vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 8, "bias vector must be contiguous");
  THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");
  THArgCheck(THTensor_(isContiguous)(normalizedValues), 9, "normalizedValues vector must be contiguous");

  long i,j,k;
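  /*
   * Layout sketch (an added illustration, not part of the original comments):
   * the batch arrives in a CSR-like form. For a hypothetical batch of two
   * samples with 2 and 3 active features each:
   *
   *   sizes       = { 2, 3 }
   *   cumSumSizes = { 2, 5 }             (cumulative sum of sizes)
   *   keys        = { k00, k01, k10, k11, k12 }
   *   values      = { v00, v01, v10, v11, v12 }
   *
   * Sample j therefore owns keys/values in the half-open range
   * [offset, offset + sizes[j]) with offset = (j == 0) ? 0 : cumSumSizes[j-1],
   * which is exactly how the loops below index into the flat buffers.
   */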
  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations.
   */
  if (outDim == 1) {
    THVector_(fill)(outputData, *biasData, batchSize);
    if (maxNormalize) {
      /* Parallelize on the batch itself */
#pragma omp parallel \
    for private(i,j) \
    firstprivate(outDim, keysOffset, \
                 weightData, keysData, \
                 valuesData, outputData, \
                 cumSumSizesData, sizesData) \
    schedule(static) \
    if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1)
      for (j = 0; j < batchSize; j++) {
        real* loutputData = outputData + j;
        real val = 0;
        real absVal = 0;
        long offset = j == 0 ? 0 : cumSumSizesData[j - 1];

        for (i = 0; i < sizesData[j]; i++) {
          long woffset = weightStride0*(keysData[offset] + keysOffset);
          absVal = fabs(valuesData[offset]);
          if (train) {
            if (absVal > weightData[woffset]) {
              weightData[woffset] = absVal;
              weightData[woffset+1] = 1/absVal;
            }
            /*
             * The following can be used to scale the size of the updates
             * depending on some rule, e.g. the frequency of a feature, ...
             * This is used at update time.
             * TODO: implement a smarter update scale.
             */
            weightData[woffset+2] = 1;
          }
          normalizedValuesData[offset] = (absVal > weightData[woffset] ? THNN_INDEXLINEAR_SIGN(valuesData[offset]) : valuesData[offset]*weightData[woffset+1]) + weightData[woffset+3];
          val += normalizedValuesData[offset] * weightData[woffset+maxNormalize];
          offset++;
        }
        *loutputData += val;
      }
    } else {
      /* Parallelize on the batch itself */
#pragma omp parallel \
    for private(i,j) \
    firstprivate(outDim, weightData, \
                 keysData, valuesData, \
                 outputData, cumSumSizesData, \
                 sizesData) \
    schedule(static) \
    if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1)
      for (j = 0; j < batchSize; j++) {
        long offset = j == 0 ? 0 : cumSumSizesData[j - 1];
        real* loutputData = outputData + j;
        real val = 0;

        for (i = 0; i < sizesData[j]; i++) {
          val += weightData[weightStride0*(keysData[offset] + keysOffset)] * valuesData[offset];
          offset++;
        }
        *loutputData += val;
      }
    }
  } else {
#pragma omp parallel \
    for private(i,j,k) \
    firstprivate(outDim, weightData, \
                 keysData, valuesData, \
                 biasData, outputData, \
                 cumSumSizesData, sizesData) \
    schedule(static) \
    if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1)
    for (j = 0; j < batchSize; j++) {
      long offset = j == 0 ? 0 : cumSumSizesData[j - 1];
      real val = 0;
      real* loutputData = outputData + j*outDim;
      real* lweightData = weightData;
      memcpy(loutputData, biasData, outDim*sizeof(real));
      for (i = 0; i < sizesData[j]; i++) {
        real val;
        long woffset = weightStride0*(keysData[offset] + keysOffset);
        if (maxNormalize) {
          val = valuesData[offset];
          real absVal = fabs(val);
          if (train) {
            if (absVal > weightData[woffset]) {
              weightData[woffset] = absVal;
              weightData[woffset+1] = 1/absVal;
            }
            /*
             * The following can be used to scale the size of the updates
             * depending on some rule, e.g. the frequency of a feature, ...
             * The commented section thereafter is just an example of what can be done:
             *
             *```
             * weightData[woffset+2] = weightData[woffset+2]==0?1:(weightData[woffset+2] / (weightData[woffset+2] + 1));
             * real alpha = 1;
             * real beta = 0.01;
             * real gamma = 1 - 0.000001;
             * real l = weightData[woffset+2]==0?1/gamma:(weightData[woffset+2] - beta) / (alpha - beta);
             * l = gamma*l;
             * weightData[woffset+2] = (alpha-beta)*l + beta;
             *```
             *
             * TODO: implement a smarter update scale.
             */
            weightData[woffset+2] = 1;
          }
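          /*
           * Added note (inferred from the indexing in this file, not taken
           * from the original comments): with maxNormalize enabled, each
           * weight row appears to reserve its first maxNormalize columns as
           * bookkeeping slots ahead of the outDim actual weights:
           *   weightData[woffset+0]  running max |value| seen for this key
           *   weightData[woffset+1]  its reciprocal, used to rescale values
           *   weightData[woffset+2]  a per-feature update scale (forced to 1 here)
           *   weightData[woffset+3]  a learned offset added after normalization
           * The "Normalize + Clamp" step below maps the raw value into [-1, 1]
           * using these slots (clamping to sign(value) when |value| exceeds the
           * stored max) and then adds the learned offset.
           */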
          /* Normalize + Clamp */
          val = (absVal > weightData[woffset] ? THNN_INDEXLINEAR_SIGN(val) : val*weightData[woffset+1]) + weightData[woffset+3];
          normalizedValuesData[offset] = val;
          lweightData = weightData + woffset + maxNormalize;
        } else {
          val = valuesData[offset];
          lweightData = weightData + woffset;
        }
        if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD) {
          THBlas_(axpy)(outDim, val, lweightData, 1, loutputData, 1);
        } else {
          for (k=0; k < outDim; k++) {
            loutputData[k] += lweightData[k] * val;
          }
        }
        offset++;
      }
    }
  }
  return;
}

void THNN_(IndexLinear_updateParameters)(
          THNNState *state,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          THLongTensor *runningKeys,
          THLongTensor *cumSumSizes,
          long keysOffset,
          accreal weightDecay_,
          accreal learningRate_)
{
  real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);

  /* Retrieve all the dimensions of the problem */
  long outDim = THTensor_(size)(bias, 0);
  long woutDim = THTensor_(size)(weight, 1);
  int maxNormalize = woutDim - outDim;
  long keysSize = THLongTensor_size(runningKeys, 0);

  /* Access the storage data/strides */
  real* gradWeightData = THTensor_(data)(gradWeight);
  real* weightData = THTensor_(data)(weight);
  long weightStride0 = weight->stride[0];
  real* gradBiasData = THTensor_(data)(gradBias);
  real* biasData = THTensor_(data)(bias);
  long* keysData = THLongTensor_data(runningKeys);

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THTensor_(isContiguous)(gradWeight), 1, "gradWeight must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradBias), 2, "gradBias vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 3, "weight matrix must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 4, "bias vector must be contiguous");
  THArgCheck(THLongTensor_isContiguous(runningKeys), 5, "keys vector must be contiguous");

  int j,k;
  long offset = 0;

  /* Update the bias first */
  THVector_(cadd)(biasData, biasData, gradBiasData, -learningRate, outDim);
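  /*
   * Added note (inferred from the buffer layout set up in accGradParameters
   * below, not from the original comments): gradWeight is expected to be the
   * dense per-key gradient buffer, one row per entry of runningKeys (with
   * 2*outDim columns per row when maxNormalize is enabled). The loops below
   * walk runningKeys and apply, per touched row,
   *   weight[key] -= learningRate * gradWeight[row]   (+ optional weight decay)
   */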
  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations.
   * No multithreading here as this could
   * corrupt the results (hogwild style)
   */
  if (outDim == 1) {
    if (maxNormalize) {
      if (weightDecay) {
        for (j = 0; j < keysSize; j++) {
          long woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize;
          real lr = learningRate*weightData[woffset-2];
          weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr;
          weightData[woffset] -= gradWeightData[2*j+1]*lr - weightDecay * weightData[woffset-2] * weightData[woffset];
        }
      } else {
        for (j = 0; j < keysSize; j++) {
          long woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize;
          real lr = learningRate*weightData[woffset-2];
          weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr;
          weightData[woffset] -= gradWeightData[2*j+1]*lr;
        }
      }
    } else {
      if (weightDecay) {
        for (j = 0; j < keysSize; j++) {
          long woffset = weightStride0*(keysData[j] + keysOffset);
          weightData[woffset] -= gradWeightData[j]*learningRate + weightDecay * weightData[woffset];
        }
      } else {
        for (j = 0; j < keysSize; j++) {
          weightData[weightStride0*(keysData[j] + keysOffset)] -= gradWeightData[j]*learningRate;
        }
      }
    }
  } else {
    for (j = 0; j < keysSize; j++) {
      real lr = learningRate;
      real wd = weightDecay;
      real* lweightData;
      long woffset = weightStride0*(keysData[j] + keysOffset);
      real* lgradWeightData = gradWeightData + j*outDim;
      if (maxNormalize) {
        lgradWeightData += j*outDim;
        /* weightData[woffset + 2] */
        lweightData = weightData + woffset + maxNormalize - 2;
        lr = lr*lweightData[0];
        wd = weightDecay*lweightData[0];
        /* weightData[woffset + 3] */
        lweightData++;
        for (k=0; k < outDim; k++) {
          lweightData[0] -= lgradWeightData[k]*lweightData[k+1]*lr;
        }
        lweightData++;
        lgradWeightData += outDim;
      } else {
        lweightData = weightData + woffset;
      }
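      /*
       * Added note: the rest of this loop applies, per touched key,
       *   w[k] -= wd * w[k]        (sparse weight decay, see comment below)
       *   w[k] -= lr * gradW[k]    (BLAS axpy for large outDim, scalar loop otherwise)
       * where lr and wd were already rescaled by the per-feature update scale
       * when maxNormalize is enabled.
       */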
      /* We do sparse weight decay.
       * We think it makes more sense.
       */
      if (weightDecay) {
        for (k=0; k < outDim; k++) {
          lweightData[k] -= lweightData[k]*wd;
        }
      }
      if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD) {
        THBlas_(axpy)(outDim, -lr, lgradWeightData, 1, lweightData, 1);
      } else {
        for (k=0; k < outDim; k++) {
          lweightData[k] -= lgradWeightData[k]*lr;
        }
      }
    }
  }
}

void THNN_(IndexLinear_accUpdateGradParameters)(
          THNNState *state,
          THLongTensor *keys,
          long keysOffset,
          THTensor *values,
          THLongTensor *sizes,
          THLongTensor *cumSumSizes,
          THTensor *gradOutput,
          THTensor *weight,
          THTensor *bias,
          accreal weightDecay_,
          accreal scale_)
{
  real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);

  /* Retrieve all the dimensions of the problem */
  long batchSize = THLongTensor_size(sizes, 0);
  long keysSize = THLongTensor_size(keys, 0);
  long outDim = THTensor_(size)(bias, 0);
  long woutDim = THTensor_(size)(weight, 1);
  int maxNormalize = woutDim - outDim;
  THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");

  /* Access the storage data/strides */
  real* gradOutputData = THTensor_(data)(gradOutput);
  real* valuesData = THTensor_(data)(values);
  real* weightData = THTensor_(data)(weight);
  real* biasData = THTensor_(data)(bias);
  long weightStride0 = weight->stride[0];
  long biasStride = bias->stride[0];
  long* keysData = THLongTensor_data(keys);
  long* sizesData = THLongTensor_data(sizes);

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 8, "bias vector must be contiguous");

  int i,j,k;
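  /*
   * Added note: unlike accGradParameters below, this entry point folds the
   * parameter update into the backward pass: it subtracts scale * gradOutput
   * (combined with the per-key values) directly from weight and bias instead
   * of accumulating into gradWeight/gradBias buffers, which is the usual
   * torch "accUpdateGradParameters" direct-update path.
   */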
  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations.
   * No multithreading here as this could
   * corrupt the results (hogwild style)
   */
  if (outDim == 1) {
    if (maxNormalize) {
      long offset = 0;
      for (j = 0; j < batchSize; j++) {
        real* lgradOutputData = gradOutputData + j;
        *biasData -= *lgradOutputData * scale;
        real val = *lgradOutputData * scale;
        real* lweightData = weightData;
        for (i = 0; i < sizesData[j]; i++) {
          long idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize;
          weightData[idx-1] -= weightData[idx]*val*weightData[idx-2];
          weightData[idx] -= (val*valuesData[offset] - weightDecay * weightData[idx])*weightData[idx-2];
          offset++;
        }
      }

      offset = 0;
      for (j = 0; j < batchSize; j++) {
        real* lweightData = weightData;
        for (i = 0; i < sizesData[j]; i++) {
          long idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize;
          weightData[idx-2] = 0;
          offset++;
        }
      }
    } else {
      if (weightDecay) {
        long offset = 0;
        for (j = 0; j < batchSize; j++) {
          real* lgradOutputData = gradOutputData + j;
          *biasData -= *lgradOutputData * scale;
          real val = *lgradOutputData * scale;
          real* lweightData = weightData;
          for (i = 0; i < sizesData[j]; i++) {
            long idx = weightStride0*(keysData[offset] + keysOffset);
            weightData[idx] -= val * valuesData[offset] + weightData[idx] * weightDecay;
            offset++;
          }
        }
      } else {
        long offset = 0;
        for (j = 0; j < batchSize; j++) {
          real val = gradOutputData[j] * scale;
          for (i = 0; i < sizesData[j]; i++) {
            weightData[(keysData[offset] + keysOffset)*weightStride0] -= val * valuesData[offset];
            offset++;
          }
          *biasData -= val;
        }
      }
    }
  } else {
    long offset = 0;
    for (j = 0; j < batchSize; j++) {
      real val = 0;
      real* lgradOutputData = gradOutputData + j*outDim;
      real* lweightData = weightData;
      THVector_(cadd)(biasData, biasData, lgradOutputData, -scale, outDim);
      for (i = 0; i < sizesData[j]; i++) {
        real val = valuesData[offset] * scale;
        real wd = weightDecay;

        // Max normalize case
        if (maxNormalize) {
          lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2);
          val *= lweightData[0];
          wd *= lweightData[0];
          for (k=0; k < outDim; k++) {
            lweightData[1] -= lweightData[k+2]*scale*lgradOutputData[k]*lweightData[0];
          }
          lweightData += 2;
        } else {
          lweightData = weightData + weightStride0*(keysData[offset] + keysOffset);
        }

        /* We do sparse weight decay.
         * We think it makes more sense.
         */
        if (weightDecay) {
          if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD) {
            THBlas_(axpy)(outDim, -wd, lweightData, 1, lweightData, 1);
          } else {
            for (k=0; k < outDim; k++) {
              lweightData[k] -= wd * lweightData[k];
            }
          }
        }

        if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD) {
          THBlas_(axpy)(outDim, -val, lgradOutputData, 1, lweightData, 1);
        } else {
          for (k=0; k < outDim; k++) {
            lweightData[k] -= val * lgradOutputData[k];
          }
        }
        offset++;
      }
    }
    /* Max Normalize case:
     * Reset the smart update scaling if
     * one does it batch-wise.
     * TODO: Decide what to do with that piece of code.
     * NB: If the code below is uncommented, so should the commented
     * code in IndexLinear:zeroGradParameters()
     */
    /*
    if (maxNormalize) {
      offset = 0;
      for (j = 0; j < batchSize; j++) {
        real* lweightData = weightData;
        for (i = 0; i < sizesData[j]; i++) {
          real val = valuesData[offset] * scale;
          real wd = weightDecay;
          lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2);
          lweightData[0] = 0;
          offset++;
        }
      }
    }
    */
  }
  return;
}

void THNN_(IndexLinear_accGradParameters)(
          THNNState *state,
          THLongTensor *keys,
          long keysOffset,
          THTensor *values,
          THLongTensor *sizes,
          THLongTensor *cumSumSizes,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          THTensor *valuesBuffer,
          accreal weightDecay_,
          accreal scale_)
{
  real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);

  /* Retrieve all the dimensions of the problem */
  long batchSize = THLongTensor_size(sizes, 0);
  long keysSize = THLongTensor_size(keys, 0);
  long outDim = THTensor_(size)(bias, 0);
  long woutDim = THTensor_(size)(weight, 1);
  long maxNormalize = (woutDim - outDim) > 0 ? 1 : 0;
  THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");
  long* sizesData = THLongTensor_data(sizes);

  /* Compute the cumulative sizes */
  THLongTensor* cumSizes = THLongTensor_new();
  THLongTensor_cumsum(cumSizes, sizes, 0);
  long* cumSizesData = THLongTensor_data(cumSizes);

  /* Resize the gradWeight buffer to keep it dense.
   * That speeds up updates A LOT assuming random mem access.
   */
  THTensor_(resize2d)(gradWeight, keysSize, outDim * (maxNormalize>0?2:1));

  /* Access the storage data/strides */
  real* gradOutputData = THTensor_(data)(gradOutput);
  real* valuesData = THTensor_(data)(values);
  real* gradWeightData = THTensor_(data)(gradWeight);
  real* weightData = THTensor_(data)(weight);
  real* gradBiasData = THTensor_(data)(gradBias);
  long gradWeightStride0 = gradWeight->stride[0];
  long weightStride0 = weight->stride[0];
  long* keysData = THLongTensor_data(keys);

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradWeight), 7, "gradWeight must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradBias), 8, "gradBias vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 9, "weight must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 10, "bias vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(valuesBuffer), 11, "valuesBuffer must be contiguous");

  int i,j,k;
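  /*
   * Added note on the gradWeight layout produced below (inferred from the
   * resize above and the indexing in the loops): row r of gradWeight holds
   * the gradient for the r-th (key, value) pair of the batch. Without
   * maxNormalize this is simply value * gradOutput (scaled); with
   * maxNormalize each row doubles in width and stores
   *   [ scale * gradOutput | scale * value * gradOutput ]
   * where the first half is later consumed by updateParameters to adjust the
   * per-feature offset slot and the second half to adjust the weights.
   */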
  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations.
   * No multithreading here as this could
   * corrupt the results (hogwild style)
   */
  if (outDim == 1) {
    for (j = 0; j < batchSize; j++) {
      long offset = j==0?0:cumSizesData[j-1];
      real val = gradOutputData[j] * scale;
      real* lgradWeightData = gradWeightData + offset;
      real* lvaluesData = valuesData + offset;
      long end = sizesData[j];

      if (maxNormalize) {
        lgradWeightData += offset;
        i = 0;
        for(;i < end; i++) {
          lgradWeightData[2*i] = val;
          lgradWeightData[2*i+1] = val * lvaluesData[i];
        }
      } else {
        i = 0;
        for(;i < end-4; i += 4) {
          lgradWeightData[i] = val * lvaluesData[i];
          lgradWeightData[i+1] = val * lvaluesData[i+1];
          lgradWeightData[i+2] = val * lvaluesData[i+2];
          lgradWeightData[i+3] = val * lvaluesData[i+3];
        }

        for(; i < end; i++) {
          lgradWeightData[i] = val * lvaluesData[i];
        }
      }
      *gradBiasData += val;
      offset += end;
    }
  } else {
    for (j = 0; j < batchSize; j++) {
      long offset = j==0?0:cumSizesData[j-1];
      real val = 0;
      real* lgradOutputData = gradOutputData + j*outDim;
      real* lgradWeightData = gradWeightData;
      real* lweightData = weightData;
      THVector_(cadd)(gradBiasData, gradBiasData, lgradOutputData, scale, outDim);
      for (i = 0; i < sizesData[j]; i++) {
        real val = valuesData[offset] * scale;
        lgradWeightData = gradWeightData + offset*outDim;
        if (maxNormalize) {
          lgradWeightData += offset*outDim;
          k = 0;
          for(;k < outDim-4; k += 4) {
            lgradWeightData[k] = lgradOutputData[k]*scale;
            lgradWeightData[k+1] = lgradOutputData[k+1]*scale;
            lgradWeightData[k+2] = lgradOutputData[k+2]*scale;
            lgradWeightData[k+3] = lgradOutputData[k+3]*scale;
          }

          for(; k < outDim; k++) {
            lgradWeightData[k] = lgradOutputData[k]*scale;
          }
          lgradWeightData += outDim;
        }
        k = 0;
        for(;k < outDim-4; k += 4) {
          lgradWeightData[k] = val * lgradOutputData[k];
          lgradWeightData[k+1] = val * lgradOutputData[k+1];
          lgradWeightData[k+2] = val * lgradOutputData[k+2];
          lgradWeightData[k+3] = val * lgradOutputData[k+3];
        }

        for(; k < outDim; k++) {
          lgradWeightData[k] = val * lgradOutputData[k];
        }
        offset++;
      }
    }
  }
  THLongTensor_free(cumSizes);
  return;
}
#endif
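/*
 * Usage sketch (added illustration, not part of the original file): the Lua
 * IndexLinear module is expected to drive these kernels roughly as
 *
 *   IndexLinear_updateOutput(...)             -- forward pass (train=1 updates the max stats)
 *   IndexLinear_accGradParameters(...)        -- fill the dense gradWeight/gradBias buffers
 *   IndexLinear_updateParameters(...)         -- apply them to the touched rows
 *
 * or, for the direct-update path,
 *
 *   IndexLinear_updateOutput(...)
 *   IndexLinear_accUpdateGradParameters(...)  -- fused accumulate + in-place update
 *
 * The actual call sequence lives on the Lua side; this is only an assumption
 * made here for readers of this C file.
 */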