Notes on My First Kaggle Silver Medal

Posted by Bigzhao on December 18, 2017

Introduction

Kaggle is currently the largest gathering place for data scientists. Many companies put up their own data and prize money to run competitions there. I recently finished my first competition, placing 131st out of 5,169 teams (top 3% and my first silver medal, haha). Since it was my first time competing, I'm very satisfied with that result.

A quick introduction to the competition: it's called Porto Seguro's Safe Driver Prediction, and the task is to build a model from historical data that predicts how likely a driver is to file an auto insurance claim in the coming year. Submissions are scored with the normalized Gini coefficient.
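The eval_gini and gini_xgb helpers used in the code later in this post aren't shown anywhere; here is a minimal sketch, assuming scikit-learn, based on the identity that the normalized Gini equals 2·AUC − 1 (gini_xgb is negated so that XGBoost's default lower-is-better early stopping works):

from sklearn.metrics import roc_auc_score

def eval_gini(y_true, y_prob):
    # Normalized Gini coefficient = 2 * AUC - 1
    return 2 * roc_auc_score(y_true, y_prob) - 1

def gini_xgb(preds, dtrain):
    # XGBoost custom eval function: returns a list of (name, value) pairs.
    # Negated so that a smaller value means a better model.
    labels = dtrain.get_label()
    return [('gini', -eval_gini(labels, preds))]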

Data Exploration

There are fairly detailed data-visualization walkthroughs among the public kernels [link].

Because the true meanings of the features in this competition are anonymized, there isn't much room for heavy feature engineering.

Data Preprocessing

Following olivier's kernel, most of the ps_calc_* features are dropped based on feature relevance; the comments below show each kept feature's importance next to that of its shuffled "shadow" copy.

import time

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# train_df / test_df are the competition's train.csv / test.csv, loaded with pandas

# from olivier
train_features = [
    "ps_car_13",  #            : 1571.65 / shadow  609.23
    "ps_reg_03",  #            : 1408.42 / shadow  511.15
    "ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
    "ps_ind_03",  #            : 1219.47 / shadow  230.55
    "ps_ind_15",  #            :  922.18 / shadow  242.00
    "ps_reg_02",  #            :  920.65 / shadow  267.50
    "ps_car_14",  #            :  798.48 / shadow  549.58
    "ps_car_12",  #            :  731.93 / shadow  293.62
    "ps_car_01_cat",  #        :  698.07 / shadow  178.72
    "ps_car_07_cat",  #        :  694.53 / shadow   36.35
    "ps_ind_17_bin",  #        :  620.77 / shadow   23.15
    "ps_car_03_cat",  #        :  611.73 / shadow   50.67
    "ps_reg_01",  #            :  598.60 / shadow  178.57
    "ps_car_15",  #            :  593.35 / shadow  226.43
    "ps_ind_01",  #            :  547.32 / shadow  154.58
    "ps_ind_16_bin",  #        :  475.37 / shadow   34.17
    "ps_ind_07_bin",  #        :  435.28 / shadow   28.92
    "ps_car_06_cat",  #        :  398.02 / shadow  212.43
    "ps_car_04_cat",  #        :  376.87 / shadow   76.98
    "ps_ind_06_bin",  #        :  370.97 / shadow   36.13
    "ps_car_09_cat",  #        :  214.12 / shadow   81.38
    "ps_car_02_cat",  #        :  203.03 / shadow   26.67
    "ps_ind_02_cat",  #        :  189.47 / shadow   65.68
    "ps_car_11",  #            :  173.28 / shadow   76.45
    "ps_car_05_cat",  #        :  172.75 / shadow   62.92
    "ps_calc_09",  #           :  169.13 / shadow  129.72
    "ps_calc_05",  #           :  148.83 / shadow  120.68
    "ps_ind_08_bin",  #        :  140.73 / shadow   27.63
    "ps_car_08_cat",  #        :  120.87 / shadow   28.82
    "ps_ind_09_bin",  #        :  113.92 / shadow   27.05
    "ps_ind_04_cat",  #        :  107.27 / shadow   37.43
    "ps_ind_18_bin",  #        :   77.42 / shadow   25.97
    "ps_ind_12_bin",  #        :   39.67 / shadow   15.52
    "ps_ind_14",  #            :   37.37 / shadow   16.65
]
# add combinations
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat'),
]
# Process data
id_test = test_df['id'].values
id_train = train_df['id'].values
y = train_df['target']

start = time.time()
for n_c, (f1, f2) in enumerate(combs):
    name1 = f1 + "_plus_" + f2
    print('current feature %60s %4d in %5.1f'
          % (name1, n_c + 1, (time.time() - start) / 60), end='')
    print('\r' * 75, end='')
    train_df[name1] = train_df[f1].astype(str) + "_" + train_df[f2].astype(str)
    test_df[name1] = test_df[f1].astype(str) + "_" + test_df[f2].astype(str)
    # Label Encode
    lbl = LabelEncoder()
    lbl.fit(list(train_df[name1].values) + list(test_df[name1].values))
    train_df[name1] = lbl.transform(list(train_df[name1].values))
    test_df[name1] = lbl.transform(list(test_df[name1].values))

    train_features.append(name1)

X = train_df[train_features]
test_df = test_df[train_features]

f_cats = [f for f in X.columns if "_cat" in f]
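These categorical columns are target-encoded inside the cross-validation loop further down. The target_encode helper called there comes from olivier's public target-encoding kernel and isn't defined in this post; the sketch below reproduces the idea, a smoothed per-category target mean fit on the training fold only, though details may differ from the exact kernel code:

import numpy as np
import pandas as pd

def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

def target_encode(trn_series, val_series, tst_series, target,
                  min_samples_leaf=1, smoothing=1, noise_level=0):
    # Smoothed per-category target mean, computed on the training fold only
    temp = pd.concat([trn_series, target], axis=1)
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # The weight on the category mean (vs. the global prior) grows with count
    weight = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    prior = target.mean()
    averages[target.name] = prior * (1 - weight) + averages["mean"] * weight
    averages = averages[[target.name]].reset_index()

    def encode(series):
        enc = series.to_frame(series.name).merge(
            averages.rename(columns={target.name: "average"}),
            on=series.name, how="left")["average"].fillna(prior)
        enc.index = series.index
        return add_noise(enc, noise_level)

    return encode(trn_series), encode(val_series), encode(tst_series)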

Model Selection

In this competition I tried a DNN, XGBoost, LightGBM, CatBoost, and libFFM, and for model aggregation I tried stacking, voting, and blending. Voting gave the best results at first, but once more models were added, stacking overtook it. The final submission was an average of the DNN, the stacking and voting outputs, and a few of my submissions that ranked well on the public leaderboard.
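The stacking code itself isn't shown in this post; below is a minimal sketch of the out-of-fold scheme, assuming numpy arrays and base models that expose the scikit-learn API (stack, base_models, and the logistic-regression meta-learner are my own illustrative choices):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

def stack(base_models, X, y, X_test, n_folds=5, seed=0):
    # Level-1 features: out-of-fold predictions from each base model
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    oof = np.zeros((len(X), len(base_models)))
    test_meta = np.zeros((len(X_test), len(base_models)))
    for m, model in enumerate(base_models):
        for trn_idx, val_idx in skf.split(X, y):
            model.fit(X[trn_idx], y[trn_idx])
            oof[val_idx, m] = model.predict_proba(X[val_idx])[:, 1]
            test_meta[:, m] += model.predict_proba(X_test)[:, 1] / n_folds
    # Level-2: fit a simple meta-learner on the out-of-fold predictions
    meta = LogisticRegression()
    meta.fit(oof, y)
    return meta.predict_proba(test_meta)[:, 1]

Note that every base model shares the same folds here; mixing different CV splits at this stage leaks information (see the blending note in the summary).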

The DNN was taken from someone else's model with only small modifications. The model code is as follows:


# Keras 1.x-style entity-embedding network. (The Merge layer below was
# removed in later Keras releases; with modern Keras this would be written
# with the functional API and Concatenate.)
from keras.models import Sequential
from keras.layers import Embedding, Reshape, Dense, Activation, Dropout, Merge

def build_embedding_network():
    # (feature, cardinality, embedding size) for each categorical input,
    # in the order the input arrays are fed to the network
    cat_specs = [
        ('ps_ind_02_cat', 5, 3),
        ('ps_ind_04_cat', 3, 2),
        ('ps_ind_05_cat', 8, 5),
        ('ps_car_01_cat', 13, 7),
        ('ps_car_02_cat', 3, 2),
        ('ps_car_03_cat', 3, 2),
        ('ps_car_04_cat', 10, 5),
        ('ps_car_05_cat', 3, 2),
        ('ps_car_06_cat', 18, 8),
        ('ps_car_07_cat', 3, 2),
        ('ps_car_09_cat', 6, 3),
        ('ps_car_10_cat', 3, 2),
        ('ps_car_11_cat', 104, 10),
    ]

    # One small embedding branch per categorical feature
    models = []
    for _, n_categories, emb_size in cat_specs:
        m = Sequential()
        m.add(Embedding(n_categories, emb_size, input_length=1))
        m.add(Reshape(target_shape=(emb_size,)))
        models.append(m)

    # The remaining 24 numeric features go through a single dense branch
    model_rest = Sequential()
    model_rest.add(Dense(16, input_dim=24))
    models.append(model_rest)

    # Concatenate all branches, then a small MLP head with dropout
    model = Sequential()
    model.add(Merge(models, mode='concat'))
    model.add(Dense(100))
    model.add(Activation('relu'))
    model.add(Dropout(.35))
    model.add(Dense(20))
    model.add(Activation('relu'))
    model.add(Dropout(.15))
    model.add(Dense(10))
    model.add(Activation('relu'))
    model.add(Dropout(.15))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam')

    return model
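A network built this way expects a list of input arrays: one per embedding branch, plus one block for the remaining numeric columns. A sketch of that plumbing (split_inputs and embed_cols are my own illustrative names):

def split_inputs(df, embed_cols):
    # One integer-coded column per embedding branch, in the cat_specs order
    inputs = [df[c].values for c in embed_cols]
    # The remaining 24 numeric features feed the Dense(16) branch as one block
    other_cols = [c for c in df.columns if c not in embed_cols]
    inputs.append(df[other_cols].values)
    return inputs

# nn = build_embedding_network()
# nn.fit(split_inputs(X_train, embed_cols), y_train, epochs=15, batch_size=4096)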

Parameter Tuning

I tuned parameters with random search; its advantage is that it leaves me with several sets of decent parameters rather than a single best one.

Below is an example of randomly searching XGBoost parameters:

from random import choice

import xgboost as xgb
from sklearn.model_selection import KFold

K = 5  # number of CV folds
kf = KFold(n_splits=K, shuffle=True, random_state=1)

params_set = []
ginis = []
OPTIMIZE_ROUNDS = True

for trial in range(10):
    # Sample a random parameter set
    param_dist = {
        'max_depth': 4,
        'objective': 'binary:logistic',
        'learning_rate': choice([0.2, 0.1, 0.07, 0.06, 0.08, 0.05, 0.03]),
        'subsample': choice([0.8, 0.75, 0.85, 0.7, 0.8]),
        'min_child_weight': choice([0.77, 0.7, 0.8, 0.9, 1]),
        'scale_pos_weight': choice([1, 1.3, 1.6, 1.7]),
        'gamma': 10,
        'reg_alpha': 8,
        'reg_lambda': 1.3,
        'n_estimators': 1000
    }

    y_valid_pred = 0 * y  # holder for out-of-fold predictions
    y_test_pred = 0
    xgb_model = xgb.XGBClassifier(**param_dist)

    for i, (train_index, test_index) in enumerate(kf.split(train_df)):
        # Create data for this fold
        y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
        X_train, X_valid = X.iloc[train_index, :].copy(), X.iloc[test_index, :].copy()
        X_test = test_df.copy()
        print("\nFold ", i)

        # Encode categorical data (target encoding fit on the training fold only)
        for f in f_cats:
            X_train[f + "_avg"], X_valid[f + "_avg"], X_test[f + "_avg"] = target_encode(
                trn_series=X_train[f],
                val_series=X_valid[f],
                tst_series=X_test[f],
                target=y_train,
                min_samples_leaf=200,
                smoothing=10,
                noise_level=0)

        # Run model for this fold
        if OPTIMIZE_ROUNDS:
            eval_set = [(X_valid, y_valid)]
            fit_model = xgb_model.fit(X_train, y_train,
                                      eval_set=eval_set,
                                      eval_metric=gini_xgb,
                                      early_stopping_rounds=100,
                                      verbose=100)
            print("  Best N trees = ", fit_model.best_iteration)
            print("  Best gini = ", fit_model.best_score)
        else:
            fit_model = xgb_model.fit(X_train, y_train)

        # Generate validation predictions for this fold
        pred = fit_model.predict_proba(X_valid)[:, 1]
        print("  Gini = ", eval_gini(y_valid, pred))
        y_valid_pred.iloc[test_index] = pred

        # Accumulate test set predictions
        y_test_pred += fit_model.predict_proba(X_test)[:, 1]

        del X_test, X_train, X_valid, y_train

    y_test_pred /= K  # Average test set predictions over the folds
    res = eval_gini(y, y_valid_pred)
    print("\nGini: {}  params: {}".format(res, param_dist))
    params_set.append(param_dist.copy())
    ginis.append(res)
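After the loop finishes, the best trial can be read off the recorded scores, e.g.:

import numpy as np

best = int(np.argmax(ginis))
print("Best CV gini:", ginis[best])
print("Best params:", params_set[best])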

Submit

Because the evaluation metric is Gini, which depends only on the ordering of the predictions, the final submissions are combined with a rank-based average.

import pandas as pd

indir = 'input/my_submit/'
infiles = [
    'NN_EntityEmbed_10fold-sub.csv', 'voting_res_mean.csv',
    'stack_test_8folds.csv', 'rank.csv', 'stack_002_test.csv'
]
for i, f in enumerate(infiles):
    subf = pd.read_csv(indir + f)
    if not i:
        sub = subf
    else:
        sub = pd.merge(sub, subf, on='id', suffixes=['', str(i)])
# Convert each submission's predictions to normalized ranks, then average them
sub['target'] = (sub.drop('id', axis=1).rank() / sub.shape[0]).mean(axis=1)
sub[['id', 'target']].to_csv('my_submission_mix.csv', index=False)

Summary

  • Compute resources matter a lot; working without a GPU was painful.
  • Keep an eye on the competition forum; the public kernels contain many good ideas.
  • Balancing the public leaderboard against your own CV score is hard; many people probably overfit this competition precisely because they put too much weight on the public ranking.
  • A blending attempt mid-competition scored very well locally but terribly on the public leaderboard. The likely cause is that my blend didn't share a single CV split, which leaks data between models; I still need to look into this.
  • The top-3 teams used neural networks to great effect, while my NN never beat the boosted trees no matter how I tuned it. More to learn there.
  • In my stacking/voting, only a few base models were genuinely different; most were the same model with different parameters. I suspect the resulting correlation kept stacking from reaching its full potential.
  • I tried two-level stacking, but it was simply too slow to run to completion.