Introduction

“Gradient boosting is one of the most successful machine learning algorithms for nonparametric regression and classification. Boosting adaptively combines a large number of relatively simple prediction models called base learners into an ensemble learner to achieve high prediction performance.”

“Its advantages are threefold. First, the model structure of TDboost is learned from data and not predetermined, thereby avoiding an explicit model specification. Non-linearities, discontinuities, complex and higher order interactions are naturally incorporated into the model to reduce the potential modeling bias and to produce high predictive performance, which enables TDboost to serve as a benchmark model in scoring insurance policies, guiding pricing practice, and facilitating marketing efforts. Feature selection is performed as an integral part of the procedure. In addition, TDboost handles the predictor and response variables of any type without the need for transformation, and it is highly robust to outliers. Missing values in the predictors are managed almost without loss of information.”

TDboost package notes

A boosted Tweedie compound Poisson model fit via gradient boosting. It is capable of fitting a flexible nonlinear Tweedie compound Poisson model (or a gamma model) and capturing interactions among predictors.

Insurance Premium Prediction via Gradient Tree-Boosted Tweedie Compound Poisson Models (R package authors pre-print)

CRAN: https://cran.r-project.org/web/packages/TDboost/TDboost.pdf

load the demo data

# Attach the modeling package and the package that ships the demo data set.
library(TDboost)
library(HDtweedie)             # provides the `auto` example data set
data("auto", package = "HDtweedie")

create a test and train subset

library(dplyr)

# tbl_df() is deprecated in modern dplyr/tibble; as_tibble() is the
# supported equivalent. Also use `<-` for assignment per R convention.
auto2 <- as_tibble(as.data.frame(auto))
# create a split based on the outcome of y which preserves the response distribution
# http://topepo.github.io/caret/splitting.html
library(caret)
set.seed(3456)                 # reproducible partition
trainIndex <- createDataPartition(auto2$y, p = .66,
                                  list = FALSE,
                                  times = 1)
head(trainIndex)
##      Resample1
## [1,]         1
## [2,]         2
## [3,]         3
## [4,]         4
## [5,]         6
## [6,]         7
train_auto <- auto2[trainIndex, ]
dim(train_auto)
## [1] 1857   57
test_auto <- auto2[-trainIndex, ]
dim(test_auto)
## [1] 955  57

data transformations for TDboost

# TDboost() works with plain data.frames; drop the tibble class before fitting.
train_auto <- as.data.frame(train_auto)
test_auto  <- as.data.frame(test_auto)

model training

fit <- TDboost(y ~. , data=train_auto, cv.folds=5, n.trees=300, interaction.depth = 20)
## CV: 1 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        8.1324          8.5538     0.0010    0.0016
##      2        8.1309          8.5548     0.0010   -0.0006
##      3        8.1288          8.5532     0.0010    0.0014
##      4        8.1278          8.5517     0.0010   -0.0013
##      5        8.1260          8.5508     0.0010    0.0010
##      6        8.1243          8.5514     0.0010    0.0001
##      7        8.1223          8.5500     0.0010    0.0013
##      8        8.1203          8.5486     0.0010    0.0011
##      9        8.1187          8.5473     0.0010    0.0009
##     10        8.1175          8.5482     0.0010   -0.0015
##    100        7.9431          8.4607     0.0010   -0.0025
##    200        7.7854          8.4049     0.0010    0.0010
##    300        7.6470          8.3557     0.0010    0.0007
## 
## CV: 2 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        8.2351          8.1419     0.0010    0.0017
##      2        8.2327          8.1405     0.0010    0.0017
##      3        8.2309          8.1400     0.0010    0.0010
##      4        8.2284          8.1390     0.0010    0.0019
##      5        8.2261          8.1382     0.0010    0.0016
##      6        8.2239          8.1368     0.0010    0.0014
##      7        8.2220          8.1374     0.0010    0.0007
##      8        8.2195          8.1378     0.0010    0.0010
##      9        8.2175          8.1370     0.0010    0.0012
##     10        8.2155          8.1359     0.0010    0.0014
##    100        8.0334          8.1102     0.0010    0.0011
##    200        7.8673          8.1166     0.0010    0.0010
##    300        7.7306          8.1709     0.0010   -0.0013
## 
## CV: 3 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        8.2650          8.0270     0.0010   -0.0016
##      2        8.2631          8.0255     0.0010    0.0013
##      3        8.2608          8.0240     0.0010    0.0020
##      4        8.2587          8.0225     0.0010    0.0016
##      5        8.2567          8.0211     0.0010    0.0006
##      6        8.2543          8.0210     0.0010    0.0015
##      7        8.2519          8.0211     0.0010    0.0011
##      8        8.2499          8.0202     0.0010    0.0004
##      9        8.2477          8.0182     0.0010    0.0013
##     10        8.2458          8.0168     0.0010    0.0004
##    100        8.0828          7.9237     0.0010    0.0000
##    200        7.9223          7.8581     0.0010    0.0009
##    300        7.7881          7.8157     0.0010    0.0004
## 
## CV: 4 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        8.1980          8.2908     0.0010    0.0016
##      2        8.1957          8.2889     0.0010    0.0016
##      3        8.1937          8.2885     0.0010    0.0006
##      4        8.1915          8.2868     0.0010    0.0017
##      5        8.1878          8.2837     0.0010    0.0031
##      6        8.1856          8.2819     0.0010    0.0016
##      7        8.1853          8.2810     0.0010   -0.0030
##      8        8.1846          8.2811     0.0010   -0.0015
##      9        8.1821          8.2793     0.0010    0.0019
##     10        8.1800          8.2789     0.0010    0.0006
##    100        8.0161          8.1743     0.0010   -0.0004
##    200        7.8609          8.0890     0.0010    0.0003
##    300        7.7263          8.0382     0.0010    0.0008
## 
## CV: 5 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        8.2512          8.0774     0.0010    0.0012
##      2        8.2491          8.0756     0.0010    0.0014
##      3        8.2469          8.0740     0.0010    0.0016
##      4        8.2447          8.0723     0.0010    0.0016
##      5        8.2430          8.0729     0.0010   -0.0005
##      6        8.2410          8.0717     0.0010    0.0012
##      7        8.2383          8.0699     0.0010    0.0025
##      8        8.2358          8.0682     0.0010    0.0016
##      9        8.2337          8.0682     0.0010    0.0008
##     10        8.2315          8.0666     0.0010    0.0014
##    100        8.0548          7.9619     0.0010    0.0011
##    200        7.8894          7.8852     0.0010    0.0009
##    300        7.7527          7.8404     0.0010    0.0009
## 
## Iter   TrainDeviance   ValidDeviance   StepSize   Improve
##      1        8.2160             nan     0.0010    0.0018
##      2        8.2140             nan     0.0010    0.0013
##      3        8.2123             nan     0.0010    0.0009
##      4        8.2106             nan     0.0010    0.0012
##      5        8.2088             nan     0.0010    0.0013
##      6        8.2067             nan     0.0010    0.0002
##      7        8.2047             nan     0.0010    0.0013
##      8        8.2029             nan     0.0010    0.0011
##      9        8.2028             nan     0.0010   -0.0032
##     10        8.2006             nan     0.0010    0.0010
##    100        8.0386             nan     0.0010    0.0006
##    200        7.8848             nan     0.0010    0.0010
##    300        7.7598             nan     0.0010   -0.0007

check performance using 5-fold cross-validation

best.iter <- TDboost.perf(fit,method="cv")

plot the performance and variable influence

# Relative influence of each predictor, computed from the first tree only
# (summary() also draws the variable-influence bar plot as a side effect).
summary(fit, n.trees = 1)

##             var   rel.inf
## 1    x.REVOLKED 29.351645
## 2     x.MVR_PTS 27.070913
## 3    x.BLUEBOOK 10.417508
## 4        x.AGE3  7.397987
## 5   x.BLUEBOOK3  5.486209
## 6        x.AREA  3.616030
## 7  x.MAX_EDUC_2  3.229753
## 8    x.TRAVTIME  2.678981
## 9  x.JOBCLASS_7  2.623611
## 10  x.HOME_VAL2  2.186072
## 11     x.INCOME  1.841182
## 12  x.BLUEBOOK2  1.118888
## 13  x.HOME_VAL3  1.054774
## 14  x.SAMEHOME2  0.993953
## 15   x.HOME_VAL  0.932493
## 16 x.CAR_TYPE_2  0.000000
## 17 x.CAR_TYPE_3  0.000000
## 18 x.CAR_TYPE_4  0.000000
## 19 x.CAR_TYPE_5  0.000000
## 20 x.CAR_TYPE_6  0.000000
## 21 x.JOBCLASS_3  0.000000
## 22 x.JOBCLASS_4  0.000000
## 23 x.JOBCLASS_5  0.000000
## 24 x.JOBCLASS_6  0.000000
## 25 x.JOBCLASS_8  0.000000
## 26 x.JOBCLASS_9  0.000000
## 27 x.MAX_EDUC_3  0.000000
## 28 x.MAX_EDUC_4  0.000000
## 29 x.MAX_EDUC_5  0.000000
## 30   x.KIDSDRIV  0.000000
## 31  x.KIDSDRIV2  0.000000
## 32  x.KIDSDRIV3  0.000000
## 33  x.TRAVTIME2  0.000000
## 34  x.TRAVTIME3  0.000000
## 35    x.NPOLICY  0.000000
## 36   x.NPOLICY2  0.000000
## 37   x.NPOLICY3  0.000000
## 38   x.MVR_PTS2  0.000000
## 39   x.MVR_PTS3  0.000000
## 40        x.AGE  0.000000
## 41       x.AGE2  0.000000
## 42   x.HOMEKIDS  0.000000
## 43  x.HOMEKIDS2  0.000000
## 44  x.HOMEKIDS3  0.000000
## 45        x.YOJ  0.000000
## 46       x.YOJ2  0.000000
## 47       x.YOJ3  0.000000
## 48    x.INCOME2  0.000000
## 49    x.INCOME3  0.000000
## 50   x.SAMEHOME  0.000000
## 51  x.SAMEHOME3  0.000000
## 52    x.CAR_USE  0.000000
## 53    x.RED_CAR  0.000000
## 54     x.GENDER  0.000000
## 55    x.MARRIED  0.000000
## 56    x.PARENT1  0.000000
summary(fit,n.trees=best.iter) # based on the estimated best number of trees

##             var     rel.inf
## 1    x.REVOLKED 34.75532285
## 2     x.MVR_PTS 20.90084985
## 3   x.BLUEBOOK2  3.60367561
## 4   x.TRAVTIME2  3.45726498
## 5    x.BLUEBOOK  3.00356665
## 6   x.HOME_VAL3  2.16967149
## 7      x.INCOME  1.93941539
## 8   x.HOME_VAL2  1.91162078
## 9    x.HOME_VAL  1.87478172
## 10    x.INCOME3  1.86828797
## 11        x.AGE  1.78601375
## 12    x.CAR_USE  1.77605653
## 13       x.AGE2  1.76699488
## 14    x.INCOME2  1.70185223
## 15  x.BLUEBOOK3  1.55137845
## 16  x.TRAVTIME3  1.54019657
## 17 x.JOBCLASS_7  1.50680593
## 18   x.TRAVTIME  1.49150631
## 19       x.AGE3  1.47500617
## 20       x.YOJ3  1.35905074
## 21  x.SAMEHOME2  1.16758188
## 22       x.YOJ2  1.07284960
## 23  x.SAMEHOME3  0.97785583
## 24   x.SAMEHOME  0.95544938
## 25        x.YOJ  0.77660014
## 26       x.AREA  0.58298262
## 27    x.NPOLICY  0.51644298
## 28 x.MAX_EDUC_4  0.37871805
## 29 x.MAX_EDUC_3  0.30800492
## 30   x.HOMEKIDS  0.27854732
## 31     x.GENDER  0.24192356
## 32 x.MAX_EDUC_2  0.23989966
## 33    x.MARRIED  0.22518150
## 34 x.CAR_TYPE_3  0.20235934
## 35 x.CAR_TYPE_5  0.13154223
## 36 x.CAR_TYPE_2  0.11749727
## 37 x.CAR_TYPE_4  0.07014899
## 38 x.JOBCLASS_8  0.05862168
## 39 x.CAR_TYPE_6  0.05702679
## 40   x.KIDSDRIV  0.04873483
## 41    x.RED_CAR  0.04368389
## 42 x.JOBCLASS_9  0.03392040
## 43 x.JOBCLASS_6  0.02263796
## 44 x.JOBCLASS_3  0.01639964
## 45 x.MAX_EDUC_5  0.01360515
## 46 x.JOBCLASS_5  0.01221596
## 47    x.PARENT1  0.01024956
## 48 x.JOBCLASS_4  0.00000000
## 49  x.KIDSDRIV2  0.00000000
## 50  x.KIDSDRIV3  0.00000000
## 51   x.NPOLICY2  0.00000000
## 52   x.NPOLICY3  0.00000000
## 53   x.MVR_PTS2  0.00000000
## 54   x.MVR_PTS3  0.00000000
## 55  x.HOMEKIDS2  0.00000000
## 56  x.HOMEKIDS3  0.00000000

model prediction / scoring

f.predict <- predict.TDboost(fit, test_auto, best.iter)

sum of squared prediction errors (SSE) on the test set

print(sum((test_auto$y - f.predict)^2))
## [1] 64227.63