You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

GBDT_common.h 3.7KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106
  1. #include "khash.h"
  2. #include <pthread.h>
  3. #define computeGradientBoostLoss(g, h) (-(g)*(g)/(h))
  4. // we use khash to make iteration faster than lua tables
  5. KHASH_SET_INIT_INT64(long)
  6. // defines the data we need for running an instance of thet and its constructor/destructor
  7. typedef struct {
  8. khash_t(long)* exampleMap;
  9. THLongTensor *exampleIdsWithFeature_cache;
  10. long minLeafSize;
  11. } GBRunData;
  12. // allocates data that cannot be shared between threads
  13. static void gb_local_create_run_data(GBRunData *run_data) {
  14. run_data->exampleMap = kh_init(long);
  15. run_data->exampleIdsWithFeature_cache = THLongTensor_new();
  16. }
  17. static void gb_create_run_data(GBRunData *run_data, int minLeafSize) {
  18. gb_local_create_run_data(run_data);
  19. run_data->minLeafSize = minLeafSize;
  20. }
  21. static void gb_destroy_run_data(GBRunData *run_data) {
  22. THLongTensor_free(run_data->exampleIdsWithFeature_cache);
  23. kh_destroy(long, run_data->exampleMap);
  24. }
  25. // initializes the data required by the optimizer for the given feature.
  26. static THLongTensor *gb_internal_prepare(lua_State *L, THLongTensor *exampleIds,
  27. THLongTensor *exampleIdsWithFeature_cache, int input_index, long feature_id,
  28. khash_t(long)* exampleMap) {
  29. long *exampleIds_data = THLongTensor_data(exampleIds);
  30. long exampleIds_size = THLongTensor_size(exampleIds, 0);
  31. int ret = 0;
  32. // if the the input is a table, then we have a sparse dataset
  33. if (lua_istable(L, input_index)) {
  34. if (exampleIds_size == 0) {
  35. return NULL;
  36. }
  37. else {
  38. // loops over the examples' ids that this node has to evaluate and, if they have the feature
  39. // we're looking for, marks them as present and stores them in the order provided by the
  40. // dataset
  41. THLongTensor_resize1d(exampleIdsWithFeature_cache, exampleIds_size);
  42. kh_clear(long, exampleMap);
  43. kh_resize(long, exampleMap, exampleIds_size*8);
  44. long *exampleIdsWithFeature_data = THLongTensor_data(exampleIdsWithFeature_cache);
  45. long j = 0;
  46. // for each sample to be evaluated
  47. for (long i = 0; i < exampleIds_size; i++) {
  48. // gets the representation for the example
  49. lua_pushinteger(L, exampleIds_data[i]);
  50. lua_gettable(L, input_index);
  51. // builds the index, which happens only once per thread for efficiency
  52. lua_pushstring(L, "buildIndex");
  53. lua_gettable(L, -2);
  54. lua_pushvalue(L, -2);
  55. lua_call(L, 1, 0);
  56. // tries to get the feature for this sample
  57. lua_pushinteger(L, feature_id);
  58. lua_gettable(L, -2);
  59. // if present, then...
  60. if (!lua_isnil(L, -1)) {
  61. // saves the example
  62. exampleIdsWithFeature_data[j] = exampleIds_data[i];
  63. j++;
  64. // marks it as present in the hash table
  65. kh_put(long, exampleMap, exampleIds_data[i], &ret);
  66. }
  67. lua_pop(L, 2);
  68. }
  69. // resizes to fit only the samples that have the feature
  70. THLongTensor_resize1d(exampleIdsWithFeature_cache, j);
  71. kh_resize(long, exampleMap, j*8);
  72. return exampleIdsWithFeature_cache;
  73. }
  74. }
  75. else {
  76. // if the input isn't a table, then it's dense and we cannot have exampleIds missing, so it
  77. // depends on feature_id
  78. // since exampleIds is fixed between calls and this is going to store the same values to the
  79. // same position, we can cache it between calls
  80. if (kh_size(exampleMap) == 0) {
  81. kh_resize(long, exampleMap, exampleIds_size*8);
  82. for (long i = 0; i < exampleIds_size; i++) {
  83. kh_put(long, exampleMap, exampleIds_data[i], &ret);
  84. }
  85. }
  86. // notice that we just return the given tensor of ids instead of copying it. the rest of the
  87. // code handles this transparently
  88. return exampleIds;
  89. }
  90. }