#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/IndexLinear.c"
#else

#ifdef _OPENMP
#include <omp.h>
#endif

/* Threshold used to trigger multithreading */
#ifndef THNN_SPARSE_OMP_THRESHOLD
#define THNN_SPARSE_OMP_THRESHOLD 100000
#endif

/* Threshold used to trigger BLAS axpy call */
#ifndef THNN_SPARSE_OUTDIM_THRESHOLD
#define THNN_SPARSE_OUTDIM_THRESHOLD 49
#endif

/* sign MACRO */
#ifndef THNN_INDEXLINEAR_SIGN
#define THNN_INDEXLINEAR_SIGN(a) ( ( (a) < 0 ) ? -1 : ( (a) > 0 ) )
#endif

static bool THNN_(checkKeysValues)(THLongTensor* keys, THTensor* values)
{
  return THLongTensor_size(keys, 0) == THTensor_(nElement)(values)
      && THTensor_(nDimension)(values) == 1
      && THLongTensor_nDimension(keys) == 1;
}

void THNN_(IndexLinear_updateOutput)(
          THNNState *state,
          THLongTensor *keys,
          long keysOffset,
          THTensor *values,
          THLongTensor *sizes,
          THLongTensor *cumSumSizes,
          THTensor *output,
          THTensor *weight,
          THTensor *bias,
          THTensor *normalizedValues,
          int train)
{
  /* Retrieve all the dimensions of the problem */
  long batchSize = THLongTensor_size(sizes, 0);
  long keysSize = THLongTensor_size(keys, 0);
  long outDim = THTensor_(size)(bias, 0);
  long woutDim = THTensor_(size)(weight, 1);
  int maxNormalize = woutDim - outDim;
  long* sizesData = THLongTensor_data(sizes);
  long* cumSumSizesData = THLongTensor_data(cumSumSizes);

  /* Define/resize the normalized values tensor if maxNormalize is > 0 */
  real* normalizedValuesData = NULL;
  if (maxNormalize) {
    THTensor_(resize1d)(normalizedValues, keysSize);
    normalizedValuesData = THTensor_(data)(normalizedValues);
  }

  /* Resize the output */
  THTensor_(resize2d)(output, batchSize, outDim);

  /* Access the storage data/strides */
  real* outputData = THTensor_(data)(output);
  real* valuesData = THTensor_(data)(values);
  real* weightData = THTensor_(data)(weight);
  long weightStride0 = weight->stride[0];
  real* biasData = THTensor_(data)(bias);
  long* keysData = THLongTensor_data(keys);

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(output), 6, "output vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 8, "bias vector must be contiguous");
  THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");
  THArgCheck(THTensor_(isContiguous)(normalizedValues), 9, "normalizedValues vector must be contiguous");

  long i,j,k;
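  /*
   * Layout sketch (an added illustration, not part of the original comments):
   * the batch arrives in a CSR-like form. For a hypothetical batch of two
   * samples with 2 and 3 active features each:
   *
   *   sizes       = { 2, 3 }
   *   cumSumSizes = { 2, 5 }             (cumulative sum of sizes)
   *   keys        = { k00, k01, k10, k11, k12 }
   *   values      = { v00, v01, v10, v11, v12 }
   *
   * Sample j therefore owns keys/values in the half-open range
   * [offset, offset + sizes[j]) with offset = (j == 0) ? 0 : cumSumSizes[j-1],
   * which is exactly how the loops below index into the flat buffers.
   */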
  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations.
   */
  if (outDim == 1) {
    THVector_(fill)(outputData, *biasData, batchSize);
    if (maxNormalize) {
      /* Parallelize on the batch itself */
#pragma omp parallel \
    for private(i,j) \
    firstprivate(outDim, keysOffset, \
                 weightData, keysData, \
                 valuesData, outputData, \
                 cumSumSizesData, sizesData) \
    schedule(static) \
    if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1)
      for (j = 0; j < batchSize; j++) {
        real* loutputData = outputData + j;
        real val = 0;
        real absVal = 0;
        long offset = j == 0 ? 0 : cumSumSizesData[j - 1];

        for (i = 0; i < sizesData[j]; i++) {
          long woffset = weightStride0*(keysData[offset] + keysOffset);
          absVal = fabs(valuesData[offset]);
          if (train) {
            if (absVal > weightData[woffset]) {
              weightData[woffset] = absVal;
              weightData[woffset+1] = 1/absVal;
            }
            /*
             * The following can be used to scale the size of the updates
             * depending on some rule, e.g. the frequency of a feature, ...
             * This is used at update time.
             * TODO: implement a smarter update scale.
             */
            weightData[woffset+2] = 1;
          }
          normalizedValuesData[offset] = (absVal > weightData[woffset] ? THNN_INDEXLINEAR_SIGN(valuesData[offset]) : valuesData[offset]*weightData[woffset+1]) + weightData[woffset+3];
          val += normalizedValuesData[offset] * weightData[woffset+maxNormalize];
          offset++;
        }
        *loutputData += val;
      }
    } else {
      /* Parallelize on the batch itself */
#pragma omp parallel \
    for private(i,j) \
    firstprivate(outDim, weightData, \
                 keysData, valuesData, \
                 outputData, cumSumSizesData, \
                 sizesData) \
    schedule(static) \
    if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1)
      for (j = 0; j < batchSize; j++) {
        long offset = j == 0 ? 0 : cumSumSizesData[j - 1];
        real* loutputData = outputData + j;
        real val = 0;

        for (i = 0; i < sizesData[j]; i++) {
          val += weightData[weightStride0*(keysData[offset] + keysOffset)] * valuesData[offset];
          offset++;
        }
        *loutputData += val;
      }
    }
  } else {
#pragma omp parallel \
    for private(i,j,k) \
    firstprivate(outDim, weightData, \
                 keysData, valuesData, \
                 biasData, outputData, \
                 cumSumSizesData, sizesData) \
    schedule(static) \
    if(keysSize*outDim > THNN_SPARSE_OMP_THRESHOLD && batchSize > 1)
    for (j = 0; j < batchSize; j++) {
      long offset = j == 0 ? 0 : cumSumSizesData[j - 1];
      real val = 0;
      real* loutputData = outputData + j*outDim;
      real* lweightData = weightData;
      memcpy(loutputData, biasData, outDim*sizeof(real));
      for (i = 0; i < sizesData[j]; i++) {
        real val;
        long woffset = weightStride0*(keysData[offset] + keysOffset);
        if (maxNormalize) {
          val = valuesData[offset];
          real absVal = fabs(val);
          if (train) {
            if (absVal > weightData[woffset]) {
              weightData[woffset] = absVal;
              weightData[woffset+1] = 1/absVal;
            }
            /*
             * The following can be used to scale the size of the updates
             * depending on some rule, e.g. the frequency of a feature, ...
             * The commented section thereafter is just an example of what can be done:
             *
             *```
             * weightData[woffset+2] = weightData[woffset+2]==0?1:(weightData[woffset+2] / (weightData[woffset+2] + 1));
             * real alpha = 1;
             * real beta = 0.01;
             * real gamma = 1 - 0.000001;
             * real l = weightData[woffset+2]==0?1/gamma:(weightData[woffset+2] - beta) / (alpha - beta);
             * l = gamma*l;
             * weightData[woffset+2] = (alpha-beta)*l + beta;
             *```
             *
             * TODO: implement a smarter update scale.
             */
            weightData[woffset+2] = 1;
          }
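          /*
           * Added note (inferred from the indexing in this file, not taken
           * from the original comments): with maxNormalize enabled, each
           * weight row appears to reserve its first maxNormalize columns as
           * bookkeeping slots ahead of the outDim actual weights:
           *   weightData[woffset+0]  running max |value| seen for this key
           *   weightData[woffset+1]  its reciprocal, used to rescale values
           *   weightData[woffset+2]  a per-feature update scale (forced to 1 here)
           *   weightData[woffset+3]  a learned offset added after normalization
           * The "Normalize + Clamp" step below maps the raw value into [-1, 1]
           * using these slots (clamping to sign(value) when |value| exceeds the
           * stored max) and then adds the learned offset.
           */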
          /* Normalize + Clamp */
          val = (absVal > weightData[woffset] ? THNN_INDEXLINEAR_SIGN(val) : val*weightData[woffset+1]) + weightData[woffset+3];
          normalizedValuesData[offset] = val;
          lweightData = weightData + woffset + maxNormalize;
        } else {
          val = valuesData[offset];
          lweightData = weightData + woffset;
        }
        if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD) {
          THBlas_(axpy)(outDim, val, lweightData, 1, loutputData, 1);
        } else {
          for (k=0; k < outDim; k++) {
            loutputData[k] += lweightData[k] * val;
          }
        }
        offset++;
      }
    }
  }
  return;
}

void THNN_(IndexLinear_updateParameters)(
          THNNState *state,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          THLongTensor *runningKeys,
          THLongTensor *cumSumSizes,
          long keysOffset,
          accreal weightDecay_,
          accreal learningRate_)
{
  real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);

  /* Retrieve all the dimensions of the problem */
  long outDim = THTensor_(size)(bias, 0);
  long woutDim = THTensor_(size)(weight, 1);
  int maxNormalize = woutDim - outDim;
  long keysSize = THLongTensor_size(runningKeys, 0);

  /* Access the storage data/strides */
  real* gradWeightData = THTensor_(data)(gradWeight);
  real* weightData = THTensor_(data)(weight);
  long weightStride0 = weight->stride[0];
  real* gradBiasData = THTensor_(data)(gradBias);
  real* biasData = THTensor_(data)(bias);
  long* keysData = THLongTensor_data(runningKeys);

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THTensor_(isContiguous)(gradWeight), 1, "gradWeight must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradBias), 2, "gradBias vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 3, "weight matrix must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 4, "bias vector must be contiguous");
  THArgCheck(THLongTensor_isContiguous(runningKeys), 5, "keys vector must be contiguous");

  int j,k;
  long offset = 0;

  /* Update the bias first */
  THVector_(cadd)(biasData, biasData, gradBiasData, -learningRate, outDim);
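  /*
   * Added note (inferred from the buffer layout set up in accGradParameters
   * below, not from the original comments): gradWeight is expected to be the
   * dense per-key gradient buffer, one row per entry of runningKeys (with
   * 2*outDim columns per row when maxNormalize is enabled). The loops below
   * walk runningKeys and apply, per touched row,
   *   weight[key] -= learningRate * gradWeight[row]   (+ optional weight decay)
   */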
  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations.
   * No multithreading here as this could
   * corrupt the results (hogwild style)
   */
  if (outDim == 1) {
    if (maxNormalize) {
      if (weightDecay) {
        for (j = 0; j < keysSize; j++) {
          long woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize;
          real lr = learningRate*weightData[woffset-2];
          weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr;
          weightData[woffset] -= gradWeightData[2*j+1]*lr - weightDecay * weightData[woffset-2] * weightData[woffset];
        }
      } else {
        for (j = 0; j < keysSize; j++) {
          long woffset = weightStride0*(keysData[j] + keysOffset) + maxNormalize;
          real lr = learningRate*weightData[woffset-2];
          weightData[woffset-1] -= weightData[woffset]*gradWeightData[2*j]*lr;
          weightData[woffset] -= gradWeightData[2*j+1]*lr;
        }
      }
    } else {
      if (weightDecay) {
        for (j = 0; j < keysSize; j++) {
          long woffset = weightStride0*(keysData[j] + keysOffset);
          weightData[woffset] -= gradWeightData[j]*learningRate + weightDecay * weightData[woffset];
        }
      } else {
        for (j = 0; j < keysSize; j++) {
          weightData[weightStride0*(keysData[j] + keysOffset)] -= gradWeightData[j]*learningRate;
        }
      }
    }
  } else {
    for (j = 0; j < keysSize; j++) {
      real lr = learningRate;
      real wd = weightDecay;
      real* lweightData;
      long woffset = weightStride0*(keysData[j] + keysOffset);
      real* lgradWeightData = gradWeightData + j*outDim;
      if (maxNormalize) {
        lgradWeightData += j*outDim;
        /* weightData[woffset + 2] */
        lweightData = weightData + woffset + maxNormalize - 2;
        lr = lr*lweightData[0];
        wd = weightDecay*lweightData[0];
        /* weightData[woffset + 3] */
        lweightData++;
        for (k=0; k < outDim; k++) {
          lweightData[0] -= lgradWeightData[k]*lweightData[k+1]*lr;
        }
        lweightData++;
        lgradWeightData += outDim;
      } else {
        lweightData = weightData + woffset;
      }
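      /*
       * Added note: the rest of this loop applies, per touched key,
       *   w[k] -= wd * w[k]        (sparse weight decay, see comment below)
       *   w[k] -= lr * gradW[k]    (BLAS axpy for large outDim, scalar loop otherwise)
       * where lr and wd were already rescaled by the per-feature update scale
       * when maxNormalize is enabled.
       */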
      /* We do sparse weight decay.
       * We think it makes more sense.
       */
      if (weightDecay) {
        for (k=0; k < outDim; k++) {
          lweightData[k] -= lweightData[k]*wd;
        }
      }
      if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD) {
        THBlas_(axpy)(outDim, -lr, lgradWeightData, 1, lweightData, 1);
      } else {
        for (k=0; k < outDim; k++) {
          lweightData[k] -= lgradWeightData[k]*lr;
        }
      }
    }
  }
}

void THNN_(IndexLinear_accUpdateGradParameters)(
          THNNState *state,
          THLongTensor *keys,
          long keysOffset,
          THTensor *values,
          THLongTensor *sizes,
          THLongTensor *cumSumSizes,
          THTensor *gradOutput,
          THTensor *weight,
          THTensor *bias,
          accreal weightDecay_,
          accreal scale_)
{
  real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);

  /* Retrieve all the dimensions of the problem */
  long batchSize = THLongTensor_size(sizes, 0);
  long keysSize = THLongTensor_size(keys, 0);
  long outDim = THTensor_(size)(bias, 0);
  long woutDim = THTensor_(size)(weight, 1);
  int maxNormalize = woutDim - outDim;
  THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");

  /* Access the storage data/strides */
  real* gradOutputData = THTensor_(data)(gradOutput);
  real* valuesData = THTensor_(data)(values);
  real* weightData = THTensor_(data)(weight);
  real* biasData = THTensor_(data)(bias);
  long weightStride0 = weight->stride[0];
  long biasStride = bias->stride[0];
  long* keysData = THLongTensor_data(keys);
  long* sizesData = THLongTensor_data(sizes);

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 7, "weight matrix must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 8, "bias vector must be contiguous");

  int i,j,k;
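  /*
   * Added note: unlike accGradParameters below, this entry point folds the
   * parameter update into the backward pass: it subtracts scale * gradOutput
   * (combined with the per-key values) directly from weight and bias instead
   * of accumulating into gradWeight/gradBias buffers, which is the usual
   * torch "accUpdateGradParameters" direct-update path.
   */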
  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations.
   * No multithreading here as this could
   * corrupt the results (hogwild style)
   */
  if (outDim == 1) {
    if (maxNormalize) {
      long offset = 0;
      for (j = 0; j < batchSize; j++) {
        real* lgradOutputData = gradOutputData + j;
        *biasData -= *lgradOutputData * scale;
        real val = *lgradOutputData * scale;
        real* lweightData = weightData;
        for (i = 0; i < sizesData[j]; i++) {
          long idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize;
          weightData[idx-1] -= weightData[idx]*val*weightData[idx-2];
          weightData[idx] -= (val*valuesData[offset] - weightDecay * weightData[idx])*weightData[idx-2];
          offset++;
        }
      }

      offset = 0;
      for (j = 0; j < batchSize; j++) {
        real* lweightData = weightData;
        for (i = 0; i < sizesData[j]; i++) {
          long idx = weightStride0*(keysData[offset] + keysOffset) + maxNormalize;
          weightData[idx-2] = 0;
          offset++;
        }
      }
    } else {
      if (weightDecay) {
        long offset = 0;
        for (j = 0; j < batchSize; j++) {
          real* lgradOutputData = gradOutputData + j;
          *biasData -= *lgradOutputData * scale;
          real val = *lgradOutputData * scale;
          real* lweightData = weightData;
          for (i = 0; i < sizesData[j]; i++) {
            long idx = weightStride0*(keysData[offset] + keysOffset);
            weightData[idx] -= val * valuesData[offset] + weightData[idx] * weightDecay;
            offset++;
          }
        }
      } else {
        long offset = 0;
        for (j = 0; j < batchSize; j++) {
          real val = gradOutputData[j] * scale;
          for (i = 0; i < sizesData[j]; i++) {
            weightData[(keysData[offset] + keysOffset)*weightStride0] -= val * valuesData[offset];
            offset++;
          }
          *biasData -= val;
        }
      }
    }
  } else {
    long offset = 0;
    for (j = 0; j < batchSize; j++) {
      real val = 0;
      real* lgradOutputData = gradOutputData + j*outDim;
      real* lweightData = weightData;
      THVector_(cadd)(biasData, biasData, lgradOutputData, -scale, outDim);
      for (i = 0; i < sizesData[j]; i++) {
        real val = valuesData[offset] * scale;
        real wd = weightDecay;

        // Max normalize case
        if (maxNormalize) {
          lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2);
          val *= lweightData[0];
          wd *= lweightData[0];
          for (k=0; k < outDim; k++) {
            lweightData[1] -= lweightData[k+2]*scale*lgradOutputData[k]*lweightData[0];
          }
          lweightData += 2;
        } else {
          lweightData = weightData + weightStride0*(keysData[offset] + keysOffset);
        }

        /* We do sparse weight decay.
         * We think it makes more sense.
         */
        if (weightDecay) {
          if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD) {
            THBlas_(axpy)(outDim, -wd, lweightData, 1, lweightData, 1);
          } else {
            for (k=0; k < outDim; k++) {
              lweightData[k] -= wd * lweightData[k];
            }
          }
        }

        if (outDim > THNN_SPARSE_OUTDIM_THRESHOLD) {
          THBlas_(axpy)(outDim, -val, lgradOutputData, 1, lweightData, 1);
        } else {
          for (k=0; k < outDim; k++) {
            lweightData[k] -= val * lgradOutputData[k];
          }
        }
        offset++;
      }
    }
    /* Max Normalize case:
     * Reset the smart update scaling if
     * one does it batch-wise.
     * TODO: Decide what to do with that piece of code.
     * NB: If the code below is uncommented, so should the commented
     * code in IndexLinear:zeroGradParameters()
     */
    /*
    if (maxNormalize) {
      offset = 0;
      for (j = 0; j < batchSize; j++) {
        real* lweightData = weightData;
        for (i = 0; i < sizesData[j]; i++) {
          real val = valuesData[offset] * scale;
          real wd = weightDecay;
          lweightData = weightData + weightStride0*(keysData[offset] + keysOffset) + (maxNormalize-2);
          lweightData[0] = 0;
          offset++;
        }
      }
    }
    */
  }
  return;
}

void THNN_(IndexLinear_accGradParameters)(
          THNNState *state,
          THLongTensor *keys,
          long keysOffset,
          THTensor *values,
          THLongTensor *sizes,
          THLongTensor *cumSumSizes,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          THTensor *valuesBuffer,
          accreal weightDecay_,
          accreal scale_)
{
  real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);

  /* Retrieve all the dimensions of the problem */
  long batchSize = THLongTensor_size(sizes, 0);
  long keysSize = THLongTensor_size(keys, 0);
  long outDim = THTensor_(size)(bias, 0);
  long woutDim = THTensor_(size)(weight, 1);
  long maxNormalize = (woutDim - outDim) > 0 ? 1 : 0;
  THArgCheck(THNN_(checkKeysValues)(keys, values), 1, "Keys and values should have the same number of elements");
  long* sizesData = THLongTensor_data(sizes);

  /* Compute the cumulative sizes */
  THLongTensor* cumSizes = THLongTensor_new();
  THLongTensor_cumsum(cumSizes, sizes, 0);
  long* cumSizesData = THLongTensor_data(cumSizes);

  /* Resize the gradWeight buffer to keep it dense.
   * That speeds up updates A LOT assuming random mem access.
   */
  THTensor_(resize2d)(gradWeight, keysSize, outDim * (maxNormalize>0?2:1));

  /* Access the storage data/strides */
  real* gradOutputData = THTensor_(data)(gradOutput);
  real* valuesData = THTensor_(data)(values);
  real* gradWeightData = THTensor_(data)(gradWeight);
  real* weightData = THTensor_(data)(weight);
  real* gradBiasData = THTensor_(data)(gradBias);
  long gradWeightStride0 = gradWeight->stride[0];
  long weightStride0 = weight->stride[0];
  long* keysData = THLongTensor_data(keys);

  /* Make sure these inputs are contiguous to accelerate computations */
  THArgCheck(THLongTensor_isContiguous(keys), 1, "keys vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(values), 3, "values vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 6, "gradOutput vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradWeight), 7, "gradWeight must be contiguous");
  THArgCheck(THTensor_(isContiguous)(gradBias), 8, "gradBias vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(weight), 9, "weight must be contiguous");
  THArgCheck(THTensor_(isContiguous)(bias), 10, "bias vector must be contiguous");
  THArgCheck(THTensor_(isContiguous)(valuesBuffer), 11, "valuesBuffer must be contiguous");

  int i,j,k;
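  /*
   * Added note on the gradWeight layout produced below (inferred from the
   * resize above and the indexing in the loops): row r of gradWeight holds
   * the gradient for the r-th (key, value) pair of the batch. Without
   * maxNormalize this is simply value * gradOutput (scaled); with
   * maxNormalize each row doubles in width and stores
   *   [ scale * gradOutput | scale * value * gradOutput ]
   * where the first half is later consumed by updateParameters to adjust the
   * per-feature offset slot and the second half to adjust the weights.
   */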
  /* Separate cases: output dimension is == 1, or > 1
   * This allows for some optimizations.
   * No multithreading here as this could
   * corrupt the results (hogwild style)
   */
  if (outDim == 1) {
    for (j = 0; j < batchSize; j++) {
      long offset = j==0?0:cumSizesData[j-1];
      real val = gradOutputData[j] * scale;
      real* lgradWeightData = gradWeightData + offset;
      real* lvaluesData = valuesData + offset;
      long end = sizesData[j];

      if (maxNormalize) {
        lgradWeightData += offset;
        i = 0;
        for(;i < end; i++) {
          lgradWeightData[2*i] = val;
          lgradWeightData[2*i+1] = val * lvaluesData[i];
        }
      } else {
        i = 0;
        for(;i < end-4; i += 4) {
          lgradWeightData[i] = val * lvaluesData[i];
          lgradWeightData[i+1] = val * lvaluesData[i+1];
          lgradWeightData[i+2] = val * lvaluesData[i+2];
          lgradWeightData[i+3] = val * lvaluesData[i+3];
        }

        for(; i < end; i++) {
          lgradWeightData[i] = val * lvaluesData[i];
        }
      }
      *gradBiasData += val;
      offset += end;
    }
  } else {
    for (j = 0; j < batchSize; j++) {
      long offset = j==0?0:cumSizesData[j-1];
      real val = 0;
      real* lgradOutputData = gradOutputData + j*outDim;
      real* lgradWeightData = gradWeightData;
      real* lweightData = weightData;
      THVector_(cadd)(gradBiasData, gradBiasData, lgradOutputData, scale, outDim);
      for (i = 0; i < sizesData[j]; i++) {
        real val = valuesData[offset] * scale;
        lgradWeightData = gradWeightData + offset*outDim;
        if (maxNormalize) {
          lgradWeightData += offset*outDim;
          k = 0;
          for(;k < outDim-4; k += 4) {
            lgradWeightData[k] = lgradOutputData[k]*scale;
            lgradWeightData[k+1] = lgradOutputData[k+1]*scale;
            lgradWeightData[k+2] = lgradOutputData[k+2]*scale;
            lgradWeightData[k+3] = lgradOutputData[k+3]*scale;
          }

          for(; k < outDim; k++) {
            lgradWeightData[k] = lgradOutputData[k]*scale;
          }
          lgradWeightData += outDim;
        }
        k = 0;
        for(;k < outDim-4; k += 4) {
          lgradWeightData[k] = val * lgradOutputData[k];
          lgradWeightData[k+1] = val * lgradOutputData[k+1];
          lgradWeightData[k+2] = val * lgradOutputData[k+2];
          lgradWeightData[k+3] = val * lgradOutputData[k+3];
        }

        for(; k < outDim; k++) {
          lgradWeightData[k] = val * lgradOutputData[k];
        }
        offset++;
      }
    }
  }
  THLongTensor_free(cumSizes);
  return;
}
#endif
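/*
 * Usage sketch (added illustration, not part of the original file): the Lua
 * IndexLinear module is expected to drive these kernels roughly as
 *
 *   IndexLinear_updateOutput(...)             -- forward pass (train=1 updates the max stats)
 *   IndexLinear_accGradParameters(...)        -- fill the dense gradWeight/gradBias buffers
 *   IndexLinear_updateParameters(...)         -- apply them to the touched rows
 *
 * or, for the direct-update path,
 *
 *   IndexLinear_updateOutput(...)
 *   IndexLinear_accUpdateGradParameters(...)  -- fused accumulate + in-place update
 *
 * The actual call sequence lives on the Lua side; this is only an assumption
 * made here for readers of this C file.
 */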