#ifndef TH_GENERIC_FILE
#define TH_GENERIC_FILE "generic/SparseLinear.c"
#else

#ifdef _OPENMP
#include <omp.h>
#endif

/* Pointer to the first element of row r / column c of a 2D tensor. */
#define ROW_PTR2(t, r) (THTensor_(data)(t) + (r) * (t)->stride[0])
#define COL_PTR2(t, c) (THTensor_(data)(t) + (c) * (t)->stride[1])

/* True when t is 3D with a last dimension of size 2 (legacy sparse input). */
static bool THNN_(checkLegacyInput)(THTensor* t)
{
  return t->nDimension == 3 && t->size[2] == 2;
}

/* True when t is 2D with 3 columns (COO sparse input, nnz x 3). */
static bool THNN_(checkInput)(THTensor* t)
{
  return t->nDimension == 2 && t->size[1] == 3;
}

/* True when t is a 2D tensor of exactly size0 x size1. */
static bool THNN_(checkSize2D)(THTensor* t, long size0, long size1)
{
  return t->nDimension == 2 && t->size[0] == size0 && t->size[1] == size1;
}

/* True when t is a 1D tensor of exactly size0 elements. */
static bool THNN_(checkSize1D)(THTensor* t, long size0)
{
  return t->nDimension == 1 && t->size[0] == size0;
}

/* Stores value at index x0 of a 1D tensor; no bounds checking is done here. */
static void THNN_(set1d)(THTensor *t, long x0, real value) {
  THStorage_(set)(t->storage, t->storageOffset + x0*t->stride[0], value);
}

/* Reads element (x0, x1, x2) of a 3D tensor; no bounds checking is done here. */
static real THNN_(get3d)(const THTensor *t, long x0, long x1, long x2) {
  return THStorage_(get)(t->storage, t->storageOffset +
                         x0*t->stride[0] + x1*t->stride[1] + x2*t->stride[2]);
}

/* Reads element (x0, x1) of a 2D tensor; no bounds checking is done here. */
static real THNN_(get2d)(const THTensor *t, long x0, long x1) {
  return THStorage_(get)(t->storage, t->storageOffset +
                         x0*t->stride[0] + x1*t->stride[1]);
}

/*
 * Forward pass for COO input: output = input * weight^T + bias.
 *
 * input  : nnz x 3; column 0 is a 1-based row (batch) index, column 1 a
 *          1-based feature index, column 2 the value.
 * output : contiguous; it is NOT resized here, its first dimension is taken
 *          as the batch size.
 * weight : outDim x inDim.
 * bias   : 1D of size outDim.
 *
 * Raises a TH error if a feature index is outside [1, inDim].
 */
void THNN_(SparseLinear_updateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          THTensor *weight,
          THTensor *bias)
{
  long h, i, hp0, hp1;
  long outDim = THTensor_(size)(weight, 0);
  long inDim = THTensor_(size)(weight, 1);
  long batchSize = THTensor_(size)(output, 0);

  THArgCheck(THNN_(checkInput)(input), 2, "input must be in coo format, nnz x 3");
  THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
  THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");

  long nnz = THTensor_(size)(input, 0);

  // csr[h] .. csr[h+1] will delimit the entries of input that belong to
  // batch row h.
  THLongTensor * csr = THLongTensor_newWithSize1d(batchSize+1);
  THLongTensor_zero(csr);

  weight = THTensor_(newContiguous)(weight);

  // Build the row pointers from the row index of consecutive entries.
  // NOTE(review): this presumably requires input to be sorted by row index,
  // and row indices are not range-checked against batchSize -- confirm with
  // the caller.
//#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000)
  for (i=0; i<nnz; i++) {
    hp0 = (long)(THNN_(get2d)(input, i, 0)) - 1;
    hp1 = (i+1 == nnz) ?
            batchSize :
            (long)(THNN_(get2d)(input, i+1, 0)) - 1;
    if (hp0 != hp1) for (h = hp0; h < hp1; h++) {
      THLongTensor_set1d(csr, h+1, i+1);
    }
  }

  // output = weight * input + bias
  THTensor_(zero)(output);
#pragma omp parallel for private(h, i) schedule(static) if (nnz > 10000)
  for (h = 0; h < batchSize; h++) {
    long i_start = THLongTensor_get1d(csr, h);
    long i_end = THLongTensor_get1d(csr, h+1);
    for (i = i_start; i < i_end; i++) {
      real val = THNN_(get2d)(input, i, 2);
      if (val == 0) {
        continue;
      }

      long offset = (long)(THNN_(get2d)(input, i, 1)) - 1;
      if (offset >= 0 && offset < inDim) {
        // output[h, :] += val * weight[:, offset]
        THBlas_(axpy)(outDim,
            val,
            COL_PTR2(weight, offset), weight->stride[0],
            ROW_PTR2(output, h), output->stride[1]);
      } else {
        // offset and inDim are long, so the conversion must be %ld.
        THError("index out of bound. updateOutput: %ld not between 1 and %ld",
            offset + 1, inDim);
      }
    }
  }

  // Add the bias to every output row.
  THTensor* output_row = THTensor_(new)();
  for (h = 0; h < batchSize; h++) {
    THTensor_(select)(output_row, output, 0, h);
    THTensor_(cadd)(output_row, bias, 1.0, output_row);
  }
  THTensor_(free)(output_row);
  THLongTensor_free(csr);
  THTensor_(free)(weight);
}

/*
 * Forward pass for legacy input: output = input * weight^T + bias.
 *
 * input  : batchSize x nnz x 2; [.., 0] is a 1-based feature index,
 *          [.., 1] the value.
 * output : must be contiguous; resized to batchSize x outDim.
 * weight : outDim x inDim.
 * bias   : 1D of size outDim.
 *
 * Raises a TH error if a feature index is outside [1, inDim].
 */
void THNN_(SparseLinear_legacyUpdateOutput)(
          THNNState *state,
          THTensor *input,
          THTensor *output,
          THTensor *weight,
          THTensor *bias)
{
  long h, i;
  long outDim = THTensor_(size)(weight, 0);
  long inDim = THTensor_(size)(weight, 1);

  THArgCheck(THNN_(checkLegacyInput)(input), 2, "input size must be batchsize x nnz x 2");
  THArgCheck(THTensor_(isContiguous)(output), 3, "output must be contiguous");
  THArgCheck(THNN_(checkSize1D)(bias, outDim), 5, "bias size wrong");

  weight = THTensor_(newContiguous)(weight);

  long batchSize = THTensor_(size)(input, 0);
  long nnz = THTensor_(size)(input, 1);
  THTensor_(resize2d)(output, batchSize, outDim);

  // output = weight * input + bias
  THTensor_(zero)(output);
#pragma omp parallel for private(h, i) schedule(static) if (   \
  batchSize > 1 && batchSize * nnz * outDim > 10000)
  for (h = 0; h < batchSize; h++) {
    for (i = 0; i < nnz; i++) {
      real val = THNN_(get3d)(input, h, i, 1);
      if (val == 0) {
        continue;
      }

      long offset = (long)(THNN_(get3d)(input, h, i, 0)) - 1;
      if (offset >= 0 && offset < inDim) {
        // output[h, :] += val * weight[:, offset]
        THBlas_(axpy)(outDim,
                      val,
                      COL_PTR2(weight, offset), weight->stride[0],
                      ROW_PTR2(output, h), output->stride[1]);
      } else {
        // offset and inDim are long, so the conversion must be %ld.
        THError("index out of bound. updateOutput: %ld not between 1 and %ld",
                offset + 1, inDim);
      }
    }
  }

  // Add the bias to every output row.
  THTensor* output_row = THTensor_(new)();
  for (h = 0; h < batchSize; h++) {
    THTensor_(select)(output_row, output, 0, h);
    THTensor_(cadd)(output_row, bias, 1.0, output_row);
  }
  THTensor_(free)(output_row);
  THTensor_(free)(weight);
}

/*
 * Accumulates gradients for COO input:
 *   gradWeight += scale * gradOutput^T * input
 *   gradBias   += scale * sum over rows of gradOutput
 *   gradWeight += weightDecay * weight   (only when weightDecay != 0)
 *
 * input      : nnz x 3 (1-based row index, 1-based feature index, value).
 * gradOutput : must be contiguous.
 * gradWeight : outDim x inDim.
 * gradBias   : 1D of size outDim.
 *
 * Raises a TH error if a feature index is outside [1, inDim].
 */
void THNN_(SparseLinear_accGradParameters)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          accreal weightDecay_,
          accreal scale_)
{
  real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  long h, i, col, hp0, hp1;
  long outDim = THTensor_(size)(weight, 0);
  long inDim = THTensor_(size)(weight, 1);

  THArgCheck(THNN_(checkInput)(input), 2,
             "input must be in coo format, nnz x 3");
  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
             "gradWeight size wrong");
  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5,
             "gradBias size wrong");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 1,
             "gradOutput must be contiguous");

  long nnz = THTensor_(size)(input, 0);

  // csc[col] .. csc[col+1] will delimit the entries of input that belong to
  // feature column col.
  THLongTensor* csc = THLongTensor_newWithSize1d(inDim+1);
  THLongTensor_zero(csc);
  weight = THTensor_(newContiguous)(weight);

  // Build the column pointers from the feature index of consecutive entries.
  // NOTE(review): this presumably requires input to be sorted by feature
  // index -- confirm with the caller.
#pragma omp parallel for private(i, h, hp0, hp1) schedule(static) if (nnz > 10000)
  for (i = 0; i < nnz; i++) {
    hp0 = (long)(THNN_(get2d)(input, i, 1)) - 1;
    hp1 = (i+1 == nnz) ?
            inDim :
            (long)(THNN_(get2d)(input, i+1, 1)) - 1;
    if (hp0 != hp1) for (h = hp0; h < hp1; h++) {
      THLongTensor_set1d(csc, h+1, i+1);
    }
  }

  // gradWeight += gradOutput * input
#pragma omp parallel for private(h, i, col) schedule(static) if (nnz > 10000)
  for (col = 0; col < inDim; col++) {
    long i_start = THLongTensor_get1d(csc, col);
    long i_end = THLongTensor_get1d(csc, col+1);
    for (i = i_start; i < i_end; i++) {
      real val = scale * THNN_(get2d)(input, i, 2);

      // NOTE(review): the row index h is used without a range check against
      // the number of rows of gradOutput.
      h = (long)(THNN_(get2d)(input, i, 0)) - 1;
      long offset = (long)(THNN_(get2d)(input, i, 1)) - 1;
      if (offset >= 0 && offset < inDim) {
        // gradWeight[:, offset] += val * gradOutput[h, :]
        THBlas_(axpy)(outDim,
            val,
            ROW_PTR2(gradOutput, h), gradOutput->stride[1],
            COL_PTR2(gradWeight, offset), gradWeight->stride[0]);
      } else {
        // offset and inDim are long, so the conversion must be %ld.
        THError(
            "index out of bound. accGradParameters: %ld not between 1 and %ld",
            offset + 1,
            inDim);
      }
    }
  }

  // gradBias += gradOutput
  THTensor* buf = THTensor_(new)();
  THTensor_(sum)(buf, gradOutput, 0, 1);
  THTensor_(cadd)(gradBias, gradBias, scale, buf);
  THTensor_(free)(buf);
  THLongTensor_free(csc);

  if (weightDecay != 0) {
    THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
  }
  THTensor_(free)(weight);
}

/*
 * Accumulates gradients for legacy input (batchSize x nnz x 2):
 *   gradWeight += scale * gradOutput^T * input
 *   gradBias   += scale * sum over rows of gradOutput
 *   gradWeight += weightDecay * weight   (only when weightDecay != 0)
 *
 * gradOutput must be contiguous and is resized to batchSize x outDim.
 *
 * Raises a TH error if a feature index is outside [1, inDim].
 */
void THNN_(SparseLinear_legacyAccGradParameters)(
          THNNState *state,
          THTensor *input,
          THTensor *gradOutput,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *weight,
          THTensor *bias,
          accreal weightDecay_,
          accreal scale_)
{
  real weightDecay = TH_CONVERT_ACCREAL_TO_REAL(weightDecay_);
  real scale = TH_CONVERT_ACCREAL_TO_REAL(scale_);
  long h, i;
  long outDim = THTensor_(size)(weight, 0);
  long inDim = THTensor_(size)(weight, 1);

  THArgCheck(THNN_(checkLegacyInput)(input), 2,
             "input size must be batchsize x nnz x 2");
  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
             "gradWeight size wrong");
  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5,
             "gradBias size wrong");
  THArgCheck(THTensor_(isContiguous)(gradOutput), 1,
             "gradOutput must be contiguous");

  long batchSize = THTensor_(size)(input, 0);
  long nnz = THTensor_(size)(input, 1);
  THTensor_(resize2d)(gradOutput, batchSize, outDim);

  // gradWeight += gradOutput * input
  // NOTE(review): the loop is parallel over i, but entries with different i
  // can carry the same feature index and then update the same gradWeight
  // column concurrently. Looks like a data race under OpenMP -- confirm.
#pragma omp parallel for private(h, i) schedule(static) if (\
  batchSize * nnz * outDim > 10000)
  for (i = 0; i < nnz; i++) {
    for (h = 0; h < batchSize; h++) {
      real val = scale * THNN_(get3d)(input, h, i, 1);
      if (val == 0) {
        continue;
      }

      long offset = (long)(THNN_(get3d)(input, h, i, 0)) - 1;
      if (offset >= 0 && offset < inDim) {
        // gradWeight[:, offset] += val * gradOutput[h, :]
        THBlas_(axpy)(outDim,
                      val,
                      ROW_PTR2(gradOutput, h), gradOutput->stride[1],
                      COL_PTR2(gradWeight, offset), gradWeight->stride[0]);
      } else {
        // offset and inDim are long, so the conversion must be %ld.
        THError(
          "index out of bound. accGradParameters: %ld not between 1 and %ld",
          offset + 1,
          inDim);
      }
    }
  }

  // gradBias += gradOutput
  THTensor* gradOutput_row = THTensor_(new)();
  for (h = 0; h < batchSize; h++) {
    THTensor_(select)(gradOutput_row, gradOutput, 0, h);
    THTensor_(cadd)(gradBias, gradBias, scale, gradOutput_row);
  }
  THTensor_(free)(gradOutput_row);

  if (weightDecay != 0) {
    THTensor_(cadd)(gradWeight, gradWeight, weightDecay, weight);
  }
}

/*
 * SGD step for COO input: bias -= learningRate * gradBias, and
 * weight[:, c] -= learningRate * gradWeight[:, c] for every feature column c
 * that has a non-zero value in lastInput.
 *
 * lastInput : nnz x 3 (1-based row index, 1-based feature index, value).
 *
 * Raises a TH error if a feature index is outside [1, inDim].
 */
void THNN_(SparseLinear_updateParameters)(
          THNNState *state,
          THTensor *weight,
          THTensor *bias,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *lastInput,
          accreal learningRate_)
{
  real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
  long i;
  long outDim = weight->size[0];
  long inDim = weight->size[1];

  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
             "gradWeight size wrong");
  THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong");
  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong");
  THArgCheck(THNN_(checkInput)(lastInput), 6,
             "input must be in coo format, nnz x 3");

  long nnz = THTensor_(size)(lastInput, 0);

  // collect unique offsets of non-0 val in input
  THTensor* offsets = THTensor_(newWithSize1d)(nnz);
  long cnt = 0;
  for (i = 0; i < nnz; i++) {
    real val = THNN_(get2d)(lastInput, i, 2);
    if (val == 0) {
      continue;
    }
    long offset = (long)(THNN_(get2d)(lastInput, i, 1)) - 1;
    if (offset >= 0 && offset < inDim) {
      THNN_(set1d)(offsets, cnt++, offset);
    } else {
      // offset and inDim are long, so the conversion must be %ld.
      THError(
          "index out of bound. updateParameters: %ld not between 1 and %ld",
          offset + 1,
          inDim);
    }
  }
  if (cnt == 0) {
    // Nothing to update; release the scratch tensor before leaving so it is
    // not leaked.
    // NOTE(review): this early exit also skips the bias update below --
    // kept as-is to preserve existing behaviour; confirm it is intended.
    THTensor_(free)(offsets);
    return;
  }
  THTensor_(resize1d)(offsets, cnt);

  // Sort, then compact equal neighbours in place to get the unique offsets.
  THTensor* uniqueOffsets = THTensor_(new)();
  THLongTensor* ri = THLongTensor_new();
  THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0);
  THLongTensor_free(ri);
  THTensor_(free)(offsets);

  cnt = 1;
  real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets);
  for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) {
    if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) {
      uniqueOffsets_p[cnt++] = uniqueOffsets_p[i];
    }
  }
  THTensor_(resize1d)(uniqueOffsets, cnt);

  // weight += -learningRate * gradWeight
  THTensor_(cadd)(bias, bias, -learningRate, gradBias);
#pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000)
  for (i = 0; i < cnt; i++) {
    long offset = (long)uniqueOffsets_p[i];
    THBlas_(axpy)(outDim,
                  -learningRate,
                  COL_PTR2(gradWeight, offset), gradWeight->stride[0],
                  COL_PTR2(weight, offset), weight->stride[0]);
  }

  THTensor_(free)(uniqueOffsets);
}

/*
 * SGD step for legacy input: bias -= learningRate * gradBias, and
 * weight[:, c] -= learningRate * gradWeight[:, c] for every feature column c
 * that has a non-zero value in lastInput.
 *
 * lastInput : batchSize x nnz x 2 ([.., 0] 1-based feature index,
 *             [.., 1] value).
 *
 * Raises a TH error if a feature index is outside [1, inDim].
 */
void THNN_(SparseLinear_legacyUpdateParameters)(
          THNNState *state,
          THTensor *weight,
          THTensor *bias,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *lastInput,
          accreal learningRate_)
{
  real learningRate = TH_CONVERT_ACCREAL_TO_REAL(learningRate_);
  long h, i;
  long outDim = weight->size[0];
  long inDim = weight->size[1];

  THArgCheck(THNN_(checkSize2D)(gradWeight, outDim, inDim), 4,
             "gradWeight size wrong");
  THArgCheck(THNN_(checkSize1D)(bias, outDim), 3, "bias size wrong");
  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 5, "gradBias size wrong");
  THArgCheck(THNN_(checkLegacyInput)(lastInput), 6,
             "input size must be batchsize x nnz x 2");

  long batchSize = THTensor_(size)(lastInput, 0);
  long nnz = THTensor_(size)(lastInput, 1);

  // collect unique offsets of non-0 val in input
  THTensor* offsets = THTensor_(newWithSize1d)(batchSize * nnz);
  long cnt = 0;
  for (h = 0; h < batchSize; h++) {
    for (i = 0; i < nnz; i++) {
      real val = THNN_(get3d)(lastInput, h, i, 1);
      if (val == 0 ) {
        continue;
      }
      long offset = (long)(THNN_(get3d)(lastInput, h, i, 0)) - 1;
      if (offset >= 0 && offset < inDim) {
        THNN_(set1d)(offsets, cnt++, offset);
      } else {
        // offset and inDim are long, so the conversion must be %ld.
        THError(
          "index out of bound. updateParameters: %ld not between 1 and %ld",
          offset + 1,
          inDim);
      }
    }
  }
  if (cnt == 0) {
    // No non-zero input: there is no weight column to update. The
    // de-duplication below always keeps one element, so without this guard
    // an uninitialised value would be used as a column offset. The bias
    // step does not depend on the offsets and is still applied.
    THTensor_(free)(offsets);
    THTensor_(cadd)(bias, bias, -learningRate, gradBias);
    return;
  }
  THTensor_(resize1d)(offsets, cnt);

  // Sort, then compact equal neighbours in place to get the unique offsets.
  THTensor* uniqueOffsets = THTensor_(new)();
  THLongTensor* ri = THLongTensor_new();
  THTensor_(sort)(uniqueOffsets, ri, offsets, 0, 0);
  THLongTensor_free(ri);
  THTensor_(free)(offsets);

  cnt = 1;
  real* uniqueOffsets_p = THTensor_(data)(uniqueOffsets);
  for (i = 1; i < THTensor_(size)(uniqueOffsets, 0); i++) {
    if (uniqueOffsets_p[i] != uniqueOffsets_p[i - 1]) {
      uniqueOffsets_p[cnt++] = uniqueOffsets_p[i];
    }
  }
  THTensor_(resize1d)(uniqueOffsets, cnt);

  // weight += -learningRate * gradWeight
  THTensor_(cadd)(bias, bias, -learningRate, gradBias);
#pragma omp parallel for private(i) schedule(static) if (cnt * outDim > 10000)
  for (i = 0; i < cnt; i++) {
    long offset = (long)uniqueOffsets_p[i];
    THBlas_(axpy)(outDim,
                  -learningRate,
                  COL_PTR2(gradWeight, offset), gradWeight->stride[0],
                  COL_PTR2(weight, offset), weight->stride[0]);
  }

  THTensor_(free)(uniqueOffsets);
}

/*
 * Zeroes gradBias and every gradWeight column whose feature index appears
 * with a non-zero value in lastInput (COO format, nnz x 3).
 *
 * Raises a TH error if a feature index is outside [1, inDim].
 */
void THNN_(SparseLinear_zeroGradParameters)(
          THNNState *state,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *lastInput)
{
  long i, j;

  long outDim = gradWeight->size[0];
  long inDim = gradWeight->size[1];

  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong");
  THArgCheck(THNN_(checkInput)(lastInput), 4,
             "input must be in coo format, nnz x 3");

  THTensor_(zero)(gradBias);

  long nnz = THTensor_(size)(lastInput, 0);

#pragma omp parallel for private(i, j) schedule(static) if (   \
  nnz * outDim > 10000)
  for (i = 0; i < nnz; i++) {
    if (THNN_(get2d)(lastInput, i, 2) == 0 ) {
      continue;
    }

    long offset = (long)(THNN_(get2d)(lastInput, i, 1)) - 1;
    if (offset >= 0 && offset < inDim) {
      real* pGradWeight = COL_PTR2(gradWeight, offset);
      if (gradWeight->stride[0] == 1) {
        // Column elements are adjacent in memory: fill in one call.
        THVector_(fill)(pGradWeight, 0, outDim);
      } else {
        long stride = gradWeight->stride[0];
        for (j = 0; j < outDim; ++j) {
          pGradWeight[j * stride] = 0;
        }
      }
    } else {
      // offset and inDim are long, so the conversion must be %ld.
      THError(
          "index out of bound. zeroGradParameters: %ld not between 1 and %ld",
          offset + 1,
          inDim);
    }
  }
}

/*
 * Zeroes gradBias and every gradWeight column whose feature index appears
 * with a non-zero value in lastInput (legacy format, batchSize x nnz x 2).
 *
 * Raises a TH error if a feature index is outside [1, inDim].
 */
void THNN_(SparseLinear_legacyZeroGradParameters)(
          THNNState *state,
          THTensor *gradWeight,
          THTensor *gradBias,
          THTensor *lastInput)
{
  long h, i, j;

  long outDim = gradWeight->size[0];
  long inDim = gradWeight->size[1];

  THArgCheck(THNN_(checkSize1D)(gradBias, outDim), 3, "gradBias size wrong");
  THArgCheck(THNN_(checkLegacyInput)(lastInput), 4,
             "input size must be batchsize x nnz x 2");

  THTensor_(zero)(gradBias);

  long batchSize = THTensor_(size)(lastInput, 0);
  long nnz = THTensor_(size)(lastInput, 1);

#pragma omp parallel for private(h, i, j) schedule(static) if (   \
  batchSize > 1 && batchSize * nnz * outDim > 10000)
  for (h = 0; h < batchSize; h++) {
    for (i = 0; i < nnz; i++) {
      if (THNN_(get3d)(lastInput, h, i, 1) == 0 ) {
        continue;
      }

      long offset = (long)(THNN_(get3d)(lastInput, h, i, 0)) - 1;
      if (offset >= 0 && offset < inDim) {
        real* pGradWeight = COL_PTR2(gradWeight, offset);
        if (gradWeight->stride[0] == 1) {
          // Column elements are adjacent in memory: fill in one call.
          THVector_(fill)(pGradWeight, 0, outDim);
        } else {
          long stride = gradWeight->stride[0];
          for (j = 0; j < outDim; ++j) {
            pGradWeight[j * stride] = 0;
          }
        }
      } else {
        // offset and inDim are long, so the conversion must be %ld.
        THError(
          "index out of bound. zeroGradParameters: %ld not between 1 and %ld",
          offset + 1,
          inDim);
      }
    }
  }
}

#undef ROW_PTR2
#undef COL_PTR2
#endif