Notes on My First Kaggle Competition: Titanic Survival Prediction

Posted by Bigzhao on August 9, 2017
  • Notebook (ipynb): https://github.com/bigzhao/Bigzhao-get-started-with-kaggle/blob/master/Titanic.ipynb
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
%matplotlib inline
train_data = pd.read_csv(u'E:\\资料书籍\\天池\\泰坦尼克号\\train.csv', sep=',')
test_data = pd.read_csv(u'E:\\资料书籍\\天池\\泰坦尼克号\\test.csv', sep=',')

First, data preprocessing.

The train and test sets are merged first and processed together. The exploratory data analysis was done elsewhere, so it is not repeated step by step here.

The main steps are:

  • First, extract the title prefix from each name, e.g. 'Mr' or 'Mrs'. The prefix partly reflects a passenger's age ('Mr' vs. 'Master', the latter used for young boys), sex, and occupation ('Capt', 'Rev', etc.).
def combine_data(train_data, test_data):
    combined_data = train_data.append(test_data)
    combined_data.reset_index(inplace=True)
    return combined_data
labels = train_data.Survived
train_data.drop('Survived', axis=1, inplace=True)
combined_data = combine_data(train_data, test_data)
def extract_name_prefix(data):
    data['NamePrefix'] = data.Name.apply(lambda x: x.split(',')[1].strip().split('.')[0])
extract_name_prefix(combined_data)
combined_data.Fare.fillna(np.median(combined_data.Fare[combined_data.Fare.notnull()]), inplace=True)
def class_name_prefix(x):
    if x in ['Ms', 'Lady', 'the Countess', 'Mrs', 'Dona', 'Mme']:
        return 'Mrs'
    if x in ['Mlle', 'Miss']:
        return 'Miss'
    if x in ['Capt', 'Col',  'Major', 'Dr', 'Rev']:
        return 'Officer'
    if x in ['Don', 'Jonkheer']:
        return 'Royalty'
    if x in ['Sir', 'Mr']:
        return 'Mr'
    else:
        return x
combined_data.NamePrefix = combined_data.NamePrefix.apply(class_name_prefix)
  • Next, add the Mother, Family, and Singleton fields.
mrs = (combined_data.NamePrefix == 'Mrs').values
parch = (combined_data.Parch >= 1).values
combined_data['Mother'] = (mrs & parch).astype(int)
# compute family size
family = combined_data.SibSp.values + combined_data.Parch.values
combined_data['Family'] = family
combined_data['Singleton'] = (combined_data.Family == 0).astype(int)
combined_data['Family_Size'] = combined_data.Family.apply(lambda x: 'Big' if x > 3 else 'Small')
Family_Size_dummies = pd.get_dummies(combined_data['Family_Size'], prefix='Family_Size')
combined_data = pd.concat([combined_data, Family_Size_dummies], axis=1)
combined_data.drop('Family_Size', axis=1, inplace=True)
NamePrefix_dummies = pd.get_dummies(combined_data['NamePrefix'], prefix='NamePrefix')
combined_data = pd.concat([combined_data, NamePrefix_dummies], axis=1)
combined_data.drop('NamePrefix', axis=1, inplace=True)
# le_NamePrefix = preprocessing.LabelEncoder().fit(combined_data.NamePrefix)
# combined_data.NamePrefix = le_NamePrefix.transform(combined_data.NamePrefix)
  • Use the rows with known values to predict the missing ages, since Age has a fairly large number of missing entries.
from sklearn.ensemble import RandomForestRegressor

### Use a RandomForestRegressor to fill in the missing Age values
def set_missing_ages(df):

    # take the existing numeric features and feed them to a Random Forest regressor
    age_df = df[['Age','Fare', 'Parch', 'SibSp', 'Pclass', 'NamePrefix_Mrs', 'NamePrefix_Miss', 'Family_Size_Big','Family_Size_Small',
                 'NamePrefix_Officer', 'NamePrefix_Royalty', 'NamePrefix_Mr', 'NamePrefix_Master', 'Mother', 'Singleton', 'Family']]

    # split passengers into known-age and unknown-age groups
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values

    # y is the target age
    y = known_age[:, 0]

    # X holds the feature values
    X = known_age[:, 1:]

    # fit the RandomForestRegressor
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)

    # predict the unknown ages with the fitted model
    predictedAges = rfr.predict(unknown_age[:, 1:])

    # fill the original missing entries with the predictions
    df.loc[ (df.Age.isnull()), 'Age' ] = predictedAges
set_missing_ages(combined_data)
  • Add a Share_Ticket feature: a passenger who shares a ticket number with others is not travelling alone, so in principle their chance of being rescued should be a bit higher.
  • Share_ticket_survived indicates whether anyone else holding the same ticket survived.
from collections import Counter
Ticket_Num_Dict = Counter(combined_data.Ticket)
combined_data['Share_Ticket'] = combined_data.Ticket.apply(lambda x: 1 if Ticket_Num_Dict[x] >= 2 else 0)
# handle the Ticket feature

# Ticket_dummies = pd.get_dummies(combined_data['Ticket'], prefix='Ticket')
# combined_data = pd.concat([combined_data, Ticket_dummies], axis=1)
# combined_data.drop('Ticket', axis=1, inplace=True)
train_data_bak = pd.read_csv(u'E:\\资料书籍\\天池\\泰坦尼克号\\train.csv', sep=',')
share_ticket_survived = []
for index, row in combined_data.iterrows():
    # check whether any training-set passenger on the same ticket actually survived
    if row.Share_Ticket == 1 and (train_data_bak[train_data_bak.Ticket == row.Ticket].Survived == 1).any():
        share_ticket_survived.append(1)
    else:
        share_ticket_survived.append(0)
combined_data['Share_ticket_survived'] = np.array(share_ticket_survived)       
combined_data.Cabin.fillna('M0', inplace=True)
combined_data.drop(['Embarked', 'Name', 'PassengerId'], axis=1, inplace=True)
combined_data.head()
index Pclass Sex Age SibSp Parch Ticket Fare Cabin Mother ... Family_Size_Big Family_Size_Small NamePrefix_Master NamePrefix_Miss NamePrefix_Mr NamePrefix_Mrs NamePrefix_Officer NamePrefix_Royalty Share_Ticket Share_ticket_survived
0 0 3 male 22.0 1 0 A/5 21171 7.2500 M0 0 ... 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0 0
1 1 1 female 38.0 1 0 PC 17599 71.2833 C85 0 ... 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1 1
2 2 3 female 26.0 0 0 STON/O2. 3101282 7.9250 M0 0 ... 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0 0
3 3 1 female 35.0 1 0 113803 53.1000 C123 0 ... 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 1 1
4 4 3 male 35.0 0 0 373450 8.0500 M0 0 ... 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0 0

5 rows × 22 columns

combined_data.drop(['Ticket'], axis=1, inplace=True)
def process_cabin_num(x):
    x = x.split(' ')[0] # if a passenger has several cabins, keep only the first one
    if len(x) <= 1:
        return 0
    else:
        return float(x[1:])

def process_cabin(combined):
    # mapping each Cabin value with the cabin letter
    cabin_num = combined.Cabin.apply(process_cabin_num)
    combined['Cabin_Numer'] = cabin_num
    combined['Cabin'] = combined['Cabin'].map(lambda c : c[0])

process_cabin(combined_data)
# # dummy encoding ...
cabin_dummies = pd.get_dummies(combined_data['Cabin'], prefix='Cabin')
combined_data = pd.concat([combined_data, cabin_dummies], axis=1)
combined_data.drop('Cabin', axis=1, inplace=True)
# le_cabin = preprocessing.LabelEncoder().fit(combined_data.Cabin)
# combined_data.Cabin = le_cabin.transform(combined_data.Cabin)
combined_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 30 columns):
index                    1309 non-null int64
Pclass                   1309 non-null int64
Sex                      1309 non-null object
Age                      1309 non-null float64
SibSp                    1309 non-null int64
Parch                    1309 non-null int64
Fare                     1309 non-null float64
Mother                   1309 non-null int32
Family                   1309 non-null int64
Singleton                1309 non-null int32
Family_Size_Big          1309 non-null float64
Family_Size_Small        1309 non-null float64
NamePrefix_Master        1309 non-null float64
NamePrefix_Miss          1309 non-null float64
NamePrefix_Mr            1309 non-null float64
NamePrefix_Mrs           1309 non-null float64
NamePrefix_Officer       1309 non-null float64
NamePrefix_Royalty       1309 non-null float64
Share_Ticket             1309 non-null int64
Share_ticket_survived    1309 non-null int32
Cabin_Numer              1309 non-null float64
Cabin_A                  1309 non-null float64
Cabin_B                  1309 non-null float64
Cabin_C                  1309 non-null float64
Cabin_D                  1309 non-null float64
Cabin_E                  1309 non-null float64
Cabin_F                  1309 non-null float64
Cabin_G                  1309 non-null float64
Cabin_M                  1309 non-null float64
Cabin_T                  1309 non-null float64
dtypes: float64(20), int32(3), int64(6), object(1)
memory usage: 291.5+ KB
# encode Sex as 0/1
combined_data.Sex = combined_data.Sex.map({'male':0, 'female': 1})
combined_data.drop('index', axis=1, inplace=True)
combined_data.head()
Pclass Sex Age SibSp Parch Fare Mother Family Singleton Family_Size_Big ... Cabin_Numer Cabin_A Cabin_B Cabin_C Cabin_D Cabin_E Cabin_F Cabin_G Cabin_M Cabin_T
0 3 0 22.0 1 0 7.2500 0 1 0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
1 1 1 38.0 1 0 71.2833 0 1 0 0.0 ... 85.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
2 3 1 26.0 0 0 7.9250 0 0 1 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
3 1 1 35.0 1 0 53.1000 0 1 0 0.0 ... 123.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
4 3 0 35.0 0 0 8.0500 0 0 1 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0

5 rows × 29 columns

  • Combine the two important features Pclass and Sex.
# build a new feature combining Pclass and Sex
def join_sex_pclass(sex, pclass):
    sex_dict = {0: 'Male', 1: 'Female'}
    pclass_dict = {1: 'High', 2: 'Mid', 3: 'Low'}
    return '{}_{}'.format(sex_dict[sex], pclass_dict[pclass])
def create_sex_pclass_feat(data):
    new_feat = []
    for index, row in data.iterrows():   # iterate over each row's (index, row) pair
        new_feat.append(join_sex_pclass(row.Sex, row.Pclass))
    return pd.Series(np.array(new_feat))
combined_data['Sex_Pclass'] = create_sex_pclass_feat(combined_data)
# handle the combined Sex_Pclass feature

sex_pclass_dummies = pd.get_dummies(combined_data['Sex_Pclass'], prefix='Sex_Pclass')
combined_data = pd.concat([combined_data, sex_pclass_dummies], axis=1)
combined_data.drop('Sex_Pclass', axis=1, inplace=True)
# le_sp = preprocessing.LabelEncoder().fit(combined_data.Sex_Pclass)
# combined_data.Sex_Pclass = le_sp.transform(combined_data.Sex_Pclass)
# normalized_Family(combined_data)
combined_data['Child'] = combined_data.Age.apply(lambda x: 1 if x <12 else 0)
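A note on the head() output below: Age and Fare already appear standardized there (negative values), but the scaling cell itself did not make it into this dump; the commented-out normalized_Family call above is a leftover of it. A minimal sketch of what that step might have looked like, assuming a plain StandardScaler over Age and Fare (the actual scaler and column set are my assumption):

from sklearn.preprocessing import StandardScaler

# Hypothetical reconstruction of the missing scaling step: standardize Age and Fare together
# over the combined train+test data. Illustrative only; the original scaling code is not shown.
scaler = StandardScaler()
combined_data[['Age', 'Fare']] = scaler.fit_transform(combined_data[['Age', 'Fare']])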
combined_data.head()
Pclass Sex Age SibSp Parch Fare Mother Family Singleton Family_Size_Big ... Cabin_G Cabin_M Cabin_T Sex_Pclass_Female_High Sex_Pclass_Female_Low Sex_Pclass_Female_Mid Sex_Pclass_Male_High Sex_Pclass_Male_Low Sex_Pclass_Male_Mid Child
0 3 0 -0.577952 1 0 -0.503291 0 1 0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0
1 1 1 0.599216 1 0 0.734744 0 1 0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0
2 3 1 -0.283660 0 0 -0.490240 0 0 1 0.0 ... 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0
3 1 1 0.378497 1 0 0.383183 0 1 0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0
4 3 0 0.378497 0 0 -0.487824 0 0 1 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0

5 rows × 36 columns

  • Next, split the data back into train and test sets and select features with a tree-based model.
train_data = combined_data.loc[:890]
test_data = combined_data.loc[891:]
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
clf = GradientBoostingClassifier(n_estimators=50, max_features='sqrt')
clf = clf.fit(train_data, labels)

model = SelectFromModel(clf, prefit=True)
train_reduced = model.transform(train_data)
test_reduced =  model.transform(test_data)
train_reduced.shape
(891L, 11L)

Next, use stacking to ensemble five base models for the final prediction.

from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.cross_validation import KFold
# Some useful parameters which will come in handy later on
ntrain = train_reduced.shape[0]
ntest = test_reduced.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)
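Side note: sklearn.cross_validation and the KFold(n, n_folds=...) signature above belong to pre-0.18 scikit-learn. On a newer scikit-learn, a roughly equivalent setup is sketched below (not part of the original run; get_oof would then have to iterate enumerate(kf.split(x_train)) instead of enumerate(kf)):

# Rough equivalent using sklearn.model_selection (scikit-learn >= 0.18).
# shuffle is off, matching the old default; random_state only matters when shuffle=True.
from sklearn.model_selection import KFold

kf = KFold(n_splits=NFOLDS, shuffle=False)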

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)

    def fit(self,x,y):
        return self.clf.fit(x,y)

    def feature_importances(self,x,y):
        return (self.clf.fit(x,y).feature_importances_)
def get_oof(clf, x_train, y_train, x_test):
    # out-of-fold predictions on the training set, plus fold-averaged predictions on the test set
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        # train on the in-fold data, then predict the held-out fold and the full test set
        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    # average the per-fold test-set predictions
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
# Put in our parameters for said classifiers
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
     'warm_start': True,
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)
x_train = train_reduced # Creates an array of the train data
x_test = test_reduced # Creats an array of the test data
y_train = labels.ravel()
# Create our OOF train and test predictions. These base results will be used as new features
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees
rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost
gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost
svc_oof_train, svc_oof_test = get_oof(svc,x_train, y_train, x_test) # Support Vector Classifier

print("Training is complete")
Training is complete
rf_features = rf.feature_importances(x_train,y_train)
et_features = et.feature_importances(x_train, y_train)
ada_features = ada.feature_importances(x_train, y_train)
gb_features = gb.feature_importances(x_train,y_train)
# rf_features = ' 0.10117536  0.19737276  0.0500577   0.01758493  0.14145643  0.06167856\
#   0.09210271  0.06871121  0.23547256  0.00354938  0.01311831  0.01772008'
# et_features = ' 0.10295689  0.31295519  0.03997998  0.02517426  0.06224174  0.05172749\
#   0.06910707  0.04409819  0.22901159  0.00888276  0.01756171  0.03630312'
# ada_features = ' 0.01   0.01   0.026  0.     0.822  0.018  0.028  0.034  0.042  0.006\
#   0.004  0. '
# gb_features = '0.01068577  0.0241477   0.12169894  0.01267324  0.59006613  0.05527318\
#   0.04954347  0.05730464  0.04142735  0.01500765  0.02054994  0.00162199'
# rf_features = map(float, rf_features.strip().split())
# et_features = map(float, et_features.strip().split())
# ada_features = map(float, ada_features.strip().split())
# gb_features = map(float, gb_features.strip().split())
# the importances were computed on train_reduced, so take only the selected columns
cols = train_data.columns[model.get_support()].values
print(len(cols))
# Create a dataframe with features
feature_dataframe = pd.DataFrame({'features': cols,
     'Random Forest feature importances': rf_features,
     'Extra Trees feature importances': et_features,
     'AdaBoost feature importances': ada_features,
     'Gradient Boost feature importances': gb_features
    })
11

Use the plotly library to display each model's feature importances, as well as their average.

  • plotly is a JS library that provides a Python interface; the interactivity is great, the syntax is quite verbose... and the documentation is so-so...
import plotly.graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)
# Scatter plots of the per-model feature importances (one figure per base model)
importance_columns = ['Random Forest feature importances',
                      'Extra Trees feature importances',
                      'AdaBoost feature importances',
                      'Gradient Boost feature importances']
for col in importance_columns:
    trace = go.Scatter(
        y = feature_dataframe[col].values,
        x = feature_dataframe['features'].values,
        mode='markers',
        marker=dict(
            sizemode = 'diameter',
            sizeref = 1,
            size = 25,
            color = feature_dataframe[col].values,
            colorscale='Portland',
            showscale=True
        ),
        text = feature_dataframe['features'].values
    )
    layout = go.Layout(
        autosize= True,
        title= col,
        hovermode= 'closest',
        yaxis=dict(
            title= 'Feature Importance',
            ticklen= 5,
            gridwidth= 2
        ),
        showlegend= False
    )
    fig = go.Figure(data=[trace], layout=layout)
    py.iplot(fig, filename='scatter2010')
# Create the new column containing the average of values

feature_dataframe['mean'] = feature_dataframe.mean(axis= 1) # axis = 1 computes the mean row-wise
feature_dataframe.head(3)
y = feature_dataframe['mean'].values
x = feature_dataframe['features'].values
data = [go.Bar(
            x= x,
             y= y,
            width = 0.5,
            marker=dict(
               color = feature_dataframe['mean'].values,
            colorscale='Portland',
            showscale=True,
            reversescale = False
            ),
            opacity=0.6
        )]

layout= go.Layout(
    autosize= True,
    title= 'Barplots of Mean Feature Importance',
    hovermode= 'closest',
    yaxis=dict(
        title= 'Feature Importance',
        ticklen= 5,
        gridwidth= 2
    ),
    showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='bar-direct-labels')
base_predictions_train = pd.DataFrame( {'RandomForest': rf_oof_train.ravel(),
     'ExtraTrees': et_oof_train.ravel(),
     'AdaBoost': ada_oof_train.ravel(),
      'GradientBoost': gb_oof_train.ravel()
    })
base_predictions_train.head()
AdaBoost ExtraTrees GradientBoost RandomForest
0 0.0 0.0 0.0 0.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 0.0 0.0 0.0 0.0
data = [
    go.Heatmap(
        z= base_predictions_train.astype(float).corr().values ,
        x=base_predictions_train.columns.values,
        y= base_predictions_train.columns.values,
          colorscale='Portland',
            showscale=True,
            reversescale = True
    )
]
py.iplot(data, filename='labelled-heatmap')
x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test), axis=1)
import os
mingw_path = r'C:\Program Files\mingw-w64\x86_64-6.3.0-posix-seh-rt_v5-rev1\mingw64\bin'  # raw string so \x86 is not treated as an escape
os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']
import xgboost as xgb
gbm = xgb.XGBClassifier(
    #learning_rate = 0.02,
 n_estimators= 2000,
 max_depth= 4,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.9,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,
 scale_pos_weight=1).fit(x_train, y_train)
predictions = gbm.predict(x_test)
C:\Anaconda2\lib\site-packages\xgboost-0.6-py2.7.egg\xgboost\sklearn.py:210: DeprecationWarning:

The nthread parameter is deprecated as of version .6.Please use n_jobs instead.nthread is deprecated.
PassengerId = pd.read_csv(u'E:\\资料书籍\\天池\\泰坦尼克号\\test.csv', sep=',').PassengerId
DataFrame({ 'PassengerId': PassengerId, 'Survived': predictions }).to_csv(u'E:\\资料书籍\\天池\\泰坦尼克号\\2017-8-9-20-25.csv', index=False)
def compute_score(clf, X, y, scoring='accuracy'):
    xval = cross_val_score(clf, X, y, cv = 5, scoring=scoring, n_jobs =-1)
    return np.mean(xval)
compute_score(gbm, train_reduced, labels)
0.83164671657823797

Summary

This was my first Kaggle competition and I learned a lot from it. I started with plots and exploratory analysis, hacked together a first model with a random forest, and the submission scored 0.76, which felt decent at the time. Then came the grind... I tried building all sorts of new features and borrowed plenty of tricks from the web, but the best I could improve to was about 0.78.

I found that discretizing Fare did not help much, and I am not sure why (a sketch of the kind of binning I mean follows below).
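For reference, here is a sketch of the sort of Fare binning that was tried. The exact bins and column names from my experiments are not recorded above, so the quartile split and the Fare_Bin name below are just illustrative assumptions:

# Illustrative only: quartile-bin Fare and one-hot encode the bins.
# The actual binning tried in the experiments may have differed.
combined_data['Fare_Bin'] = pd.qcut(combined_data.Fare, 4, labels=['F1', 'F2', 'F3', 'F4'])
fare_dummies = pd.get_dummies(combined_data['Fare_Bin'], prefix='Fare_Bin')
combined_data = pd.concat([combined_data, fare_dummies], axis=1)
combined_data.drop('Fare_Bin', axis=1, inplace=True)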

In the end I reduced the feature set and stacked five models, which brought the Kaggle score to roughly 0.803; I kept tinkering afterwards but without much further improvement. Still, the Titanic competition is a great project to get started with.

Finally, here's a picture to mark the occasion, haha.