- Notebook source (ipynb): https://github.com/bigzhao/Bigzhao-get-started-with-kaggle/blob/master/Titanic.ipynb
import pandas as pd
from pandas import DataFrame, Series
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
%matplotlib inline
train_data = pd.read_csv(u'E:\\资料书籍\\天池\\泰坦尼克号\\train.csv', sep=',')
test_data = pd.read_csv(u'E:\\资料书籍\\天池\\泰坦尼克号\\test.csv', sep=',')
First, data preprocessing.
Merge the train and test sets so they can be processed together; the exploratory analysis was done elsewhere, so it is not repeated here.
The main steps are:
- First, extract the title prefix from each name, e.g. 'Mr', 'Mrs'. The title reflects, to some degree, the passenger's age ('Mr' vs 'Master', the latter denoting a young boy), sex, and occupation ('Capt' for captain, 'Rev' for clergy).
def combine_data(train_data, test_data):
    combined_data = train_data.append(test_data)
    combined_data.reset_index(inplace=True)
    return combined_data
labels = train_data.Survived
train_data.drop('Survived', axis=1, inplace=True)
combined_data = combine_data(train_data, test_data)
def extract_name_prefix(data):
    data['NamePrefix'] = data.Name.apply(lambda x: x.split(',')[1].strip().split('.')[0])

extract_name_prefix(combined_data)
combined_data.Fare.fillna(np.median(combined_data.Fare[combined_data.Fare.notnull()]), inplace=True)
def classify_name_prefix(x):
    if x in ['Ms', 'Lady', 'the Countess', 'Mrs', 'Dona', 'Mme']:
        return 'Mrs'
    if x in ['Mlle', 'Miss']:
        return 'Miss'
    if x in ['Capt', 'Col', 'Major', 'Dr', 'Rev']:
        return 'Officer'
    if x in ['Don', 'Jonkheer']:
        return 'Royalty'
    if x in ['Sir', 'Mr']:
        return 'Mr'
    else:
        return x

combined_data.NamePrefix = combined_data.NamePrefix.apply(classify_name_prefix)
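As a quick sanity check (my addition, not a cell from the original notebook), the titles should now collapse into just six buckets:

# Hypothetical check: inspect the distribution of the cleaned titles.
print combined_data.NamePrefix.value_counts()
# Expect only Mr, Miss, Mrs, Master, Officer and Royalty to remain.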
- Next, add the Mother, Family, and Singleton features.
mrs = (combined_data.NamePrefix == 'Mrs').values
parch = (combined_data.Parch >= 1).values
combined_data['Mother'] = (mrs & parch).astype(int)
# Compute family size (siblings/spouses plus parents/children aboard)
family = combined_data.SibSp.values + combined_data.Parch.values
combined_data['Family'] = family
combined_data['Singleton'] = (combined_data.Family == 0).astype(int)
combined_data['Family_Size'] = combined_data.Family.apply(lambda x: 'Big' if x > 3 else 'Small')
Family_Size_dummies = pd.get_dummies(combined_data['Family_Size'], prefix='Family_Size')
combined_data = pd.concat([combined_data, Family_Size_dummies], axis=1)
combined_data.drop('Family_Size', axis=1, inplace=True)
NamePrefix_dummies = pd.get_dummies(combined_data['NamePrefix'], prefix='NamePrefix')
combined_data = pd.concat([combined_data, NamePrefix_dummies], axis=1)
combined_data.drop('NamePrefix', axis=1, inplace=True)
# le_NamePrefix = preprocessing.LabelEncoder().fit(combined_data.NamePrefix)
# combined_data.NamePrefix = le_NamePrefix.transform(combined_data.NamePrefix)
- Use the rows with a known age to predict the missing ages, since Age has a large number of missing values.
from sklearn.ensemble import RandomForestRegressor
### Use RandomForestRegressor to fill in the missing Age values
def set_missing_ages(df):
    # Feed the available numeric features into a Random Forest regressor
    age_df = df[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass', 'NamePrefix_Mrs', 'NamePrefix_Miss',
                 'Family_Size_Big', 'Family_Size_Small', 'NamePrefix_Officer', 'NamePrefix_Royalty',
                 'NamePrefix_Mr', 'NamePrefix_Master', 'Mother', 'Singleton', 'Family']]
    # Split passengers into those with a known age and those without
    known_age = age_df[age_df.Age.notnull()].values
    unknown_age = age_df[age_df.Age.isnull()].values
    # y is the target age
    y = known_age[:, 0]
    # X is the feature matrix
    X = known_age[:, 1:]
    # Fit the RandomForestRegressor
    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)
    # Predict the unknown ages with the fitted model
    predictedAges = rfr.predict(unknown_age[:, 1:])
    # Fill the missing values with the predictions
    df.loc[(df.Age.isnull()), 'Age'] = predictedAges
set_missing_ages(combined_data)
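A quick assertion (added here, not in the original run) confirms the imputation left no gaps:

# Hypothetical check: every passenger should now have an Age.
assert combined_data.Age.isnull().sum() == 0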
- Add a Share_Ticket feature: sharing a ticket number with someone means the passenger was not travelling alone, which should plausibly raise the chance of being rescued.
- Share_ticket_survived indicates whether anyone else on the same ticket survived.
from collections import Counter
Ticket_Num_Dict = Counter(combined_data.Ticket)
combined_data['Share_Ticket'] = combined_data.Ticket.apply(lambda x: 1 if Ticket_Num_Dict[x] >= 2 else 0)
# (Alternative) one-hot encode the Ticket feature, kept commented out:
# Ticket_dummies = pd.get_dummies(combined_data['Ticket'], prefix='Ticket')
# combined_data = pd.concat([combined_data, Ticket_dummies], axis=1)
# combined_data.drop('Ticket', axis=1, inplace=True)
train_data_bak = pd.read_csv(u'E:\\资料书籍\\天池\\泰坦尼克号\\train.csv', sep=',')
share_ticket_survived = []
for index, row in combined_data.iterrows():
    # Count survivors among training passengers holding the same ticket
    # (for training rows this includes the passenger themselves, a mild label leak)
    same_ticket = train_data_bak[train_data_bak.Ticket == row.Ticket]
    if row.Share_Ticket == 1 and (same_ticket.Survived == 1).sum() > 0:
        share_ticket_survived.append(1)
    else:
        share_ticket_survived.append(0)
combined_data['Share_ticket_survived'] = np.array(share_ticket_survived)
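The same feature can also be built without an explicit Python loop. A minimal vectorized sketch (my own variant, not from the notebook; it shares the self-inclusion caveat noted above):

# Map each ticket to whether anyone holding it survived in the training labels.
ticket_any_survived = train_data_bak.groupby('Ticket')['Survived'].max()
on_survivor_ticket = combined_data.Ticket.map(ticket_any_survived).fillna(0)
combined_data['Share_ticket_survived'] = ((combined_data.Share_Ticket == 1) & (on_survivor_ticket == 1)).astype(int)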
combined_data.Cabin.fillna('M0', inplace=True)
combined_data.drop(['Embarked', 'Name', 'PassengerId'], axis=1, inplace=True)
combined_data.head()
  | index | Pclass | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Mother | ... | Family_Size_Big | Family_Size_Small | NamePrefix_Master | NamePrefix_Miss | NamePrefix_Mr | NamePrefix_Mrs | NamePrefix_Officer | NamePrefix_Royalty | Share_Ticket | Share_ticket_survived
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 3 | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | M0 | 0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0 | 0 |
1 | 1 | 1 | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | 0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1 | 1 |
2 | 2 | 3 | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | M0 | 0 | ... | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 |
3 | 3 | 1 | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | 0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 1 | 1 |
4 | 4 | 3 | male | 35.0 | 0 | 0 | 373450 | 8.0500 | M0 | 0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0 | 0 |
5 rows × 22 columns
combined_data.drop(['Ticket'], axis=1, inplace=True)
def process_cabin_num(x):
    x = x.split(' ')[0]  # if a passenger has several cabins, keep only the first
    if len(x) <= 1:
        return 0
    else:
        return float(x[1:])

def process_cabin(combined):
    # map each Cabin value to its numeric part and its deck letter
    cabin_num = combined.Cabin.apply(process_cabin_num)
    combined['Cabin_Number'] = cabin_num
    combined['Cabin'] = combined['Cabin'].map(lambda c: c[0])
process_cabin(combined_data)
# dummy encoding for Cabin
cabin_dummies = pd.get_dummies(combined_data['Cabin'], prefix='Cabin')
combined_data = pd.concat([combined_data, cabin_dummies], axis=1)
combined_data.drop('Cabin', axis=1, inplace=True)
# le_cabin = preprocessing.LabelEncoder().fit(combined_data.Cabin)
# combined_data.Cabin = le_cabin.transform(combined_data.Cabin)
combined_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 30 columns):
index 1309 non-null int64
Pclass 1309 non-null int64
Sex 1309 non-null object
Age 1309 non-null float64
SibSp 1309 non-null int64
Parch 1309 non-null int64
Fare 1309 non-null float64
Mother 1309 non-null int32
Family 1309 non-null int64
Singleton 1309 non-null int32
Family_Size_Big 1309 non-null float64
Family_Size_Small 1309 non-null float64
NamePrefix_Master 1309 non-null float64
NamePrefix_Miss 1309 non-null float64
NamePrefix_Mr 1309 non-null float64
NamePrefix_Mrs 1309 non-null float64
NamePrefix_Officer 1309 non-null float64
NamePrefix_Royalty 1309 non-null float64
Share_Ticket 1309 non-null int64
Share_ticket_survived 1309 non-null int32
Cabin_Number 1309 non-null float64
Cabin_A 1309 non-null float64
Cabin_B 1309 non-null float64
Cabin_C 1309 non-null float64
Cabin_D 1309 non-null float64
Cabin_E 1309 non-null float64
Cabin_F 1309 non-null float64
Cabin_G 1309 non-null float64
Cabin_M 1309 non-null float64
Cabin_T 1309 non-null float64
dtypes: float64(20), int32(3), int64(6), object(1)
memory usage: 291.5+ KB
# Encode Sex numerically
combined_data.Sex = combined_data.Sex.map({'male':0, 'female': 1})
combined_data.drop('index', axis=1, inplace=True)
combined_data.head()
  | Pclass | Sex | Age | SibSp | Parch | Fare | Mother | Family | Singleton | Family_Size_Big | ... | Cabin_Number | Cabin_A | Cabin_B | Cabin_C | Cabin_D | Cabin_E | Cabin_F | Cabin_G | Cabin_M | Cabin_T
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 3 | 0 | 22.0 | 1 | 0 | 7.2500 | 0 | 1 | 0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
1 | 1 | 1 | 38.0 | 1 | 0 | 71.2833 | 0 | 1 | 0 | 0.0 | ... | 85.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 3 | 1 | 26.0 | 0 | 0 | 7.9250 | 0 | 0 | 1 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
3 | 1 | 1 | 35.0 | 1 | 0 | 53.1000 | 0 | 1 | 0 | 0.0 | ... | 123.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 3 | 0 | 35.0 | 0 | 0 | 8.0500 | 0 | 0 | 1 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
5 rows × 29 columns
- Combine the two important features Pclass and Sex.
# Build a new feature combining Pclass and Sex
def join_sex_pclass(sex, pclass):
    sex_dict = {0: 'Male', 1: 'Female'}
    pclass_dict = {1: 'High', 2: 'Mid', 3: 'Low'}
    return '{}_{}'.format(sex_dict[sex], pclass_dict[pclass])

def create_sex_pclass_feat(data):
    new_feat = []
    for index, row in data.iterrows():  # iterate over (index, row) pairs
        new_feat.append(join_sex_pclass(row.Sex, row.Pclass))
    return pd.Series(np.array(new_feat))
combined_data['Sex_Pclass'] = create_sex_pclass_feat(combined_data)
# one-hot encode the combined Sex_Pclass feature
sex_pclass_dummies = pd.get_dummies(combined_data['Sex_Pclass'], prefix='Sex_Pclass')
combined_data = pd.concat([combined_data, sex_pclass_dummies], axis=1)
combined_data.drop('Sex_Pclass', axis=1, inplace=True)
# le_sp = preprocessing.LabelEncoder().fit(combined_data.Sex_Pclass)
# combined_data.Sex_Pclass = le_sp.transform(combined_data.Sex_Pclass)
# normalized_Family(combined_data)
combined_data['Child'] = combined_data.Age.apply(lambda x: 1 if x < 12 else 0)
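Note that in the head() output below, Age and Fare already appear standardized (negative values), while Child was evidently computed on the raw ages; the scaling cell itself is missing from this export. A minimal sketch of what it presumably looked like, using the preprocessing module imported at the top (an assumption, not recovered code):

# Assumed scaling step, not present in the exported notebook: z-score Age and Fare.
combined_data['Age'] = preprocessing.scale(combined_data['Age'].astype(float))
combined_data['Fare'] = preprocessing.scale(combined_data['Fare'].astype(float))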
combined_data.head()
  | Pclass | Sex | Age | SibSp | Parch | Fare | Mother | Family | Singleton | Family_Size_Big | ... | Cabin_G | Cabin_M | Cabin_T | Sex_Pclass_Female_High | Sex_Pclass_Female_Low | Sex_Pclass_Female_Mid | Sex_Pclass_Male_High | Sex_Pclass_Male_Low | Sex_Pclass_Male_Mid | Child
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 3 | 0 | -0.577952 | 1 | 0 | -0.503291 | 0 | 1 | 0 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0 |
1 | 1 | 1 | 0.599216 | 1 | 0 | 0.734744 | 0 | 1 | 0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 |
2 | 3 | 1 | -0.283660 | 0 | 0 | -0.490240 | 0 | 0 | 1 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 |
3 | 1 | 1 | 0.378497 | 1 | 0 | 0.383183 | 0 | 1 | 0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 |
4 | 3 | 0 | 0.378497 | 0 | 0 | -0.487824 | 0 | 0 | 1 | 0.0 | ... | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0 |
5 rows × 36 columns
- Next, analyze the data: split it back into train and test sets and run feature selection.
train_data = combined_data.loc[:890]
test_data = combined_data.loc[891:]
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
clf = GradientBoostingClassifier(n_estimators=50, max_features='sqrt')
clf = clf.fit(train_data, labels)
model = SelectFromModel(clf, prefit=True)
train_reduced = model.transform(train_data)
test_reduced = model.transform(test_data)
train_reduced.shape
(891L, 11L)
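To see which columns SelectFromModel actually kept, its boolean support mask can be applied to the original column index (an inspection step I'm adding; it was not in the notebook):

# Hypothetical inspection: names of the 11 features retained by SelectFromModel.
selected_cols = train_data.columns[model.get_support()]
print list(selected_cols)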
Next, use stacking to ensemble five models for prediction: each base model produces out-of-fold (OOF) predictions on the training set, and those predictions become the input features of a second-level model.
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.cross_validation import StratifiedKFold, KFold, cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
# Some useful parameters which will come in handy later on
ntrain = train_reduced.shape[0]
ntest = test_reduced.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)
# Wrapper class around an sklearn classifier, fixing the random seed
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params = params or {}
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)

    def fit(self, x, y):
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        return self.clf.fit(x, y).feature_importances_
# Produce out-of-fold (OOF) predictions for the training set and
# fold-averaged predictions for the test set
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        # predictions on the held-out fold become the OOF training feature
        oof_train[test_index] = clf.predict(x_te)
        # predictions on the test set, one row per fold
        oof_test_skf[i, :] = clf.predict(x_test)

    # average the per-fold test predictions
    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
# Put in our parameters for said classifiers
# Random Forest parameters
rf_params = {
'n_jobs': -1,
'n_estimators': 500,
'warm_start': True,
#'max_features': 0.2,
'max_depth': 6,
'min_samples_leaf': 2,
'max_features' : 'sqrt',
'verbose': 0
}
# Extra Trees Parameters
et_params = {
'n_jobs': -1,
'n_estimators':500,
#'max_features': 0.5,
'max_depth': 8,
'min_samples_leaf': 2,
'verbose': 0
}
# AdaBoost parameters
ada_params = {
'n_estimators': 500,
'learning_rate' : 0.75
}
# Gradient Boosting parameters
gb_params = {
'n_estimators': 500,
#'max_features': 0.2,
'max_depth': 5,
'min_samples_leaf': 2,
'verbose': 0
}
# Support Vector Classifier parameters
svc_params = {
'kernel' : 'linear',
'C' : 0.025
}
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)
x_train = train_reduced # Creates an array of the train data
x_test = test_reduced # Creates an array of the test data
y_train = labels.ravel()
# Create our OOF train and test predictions. These base results will be used as new features
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees
rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost
gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost
svc_oof_train, svc_oof_test = get_oof(svc,x_train, y_train, x_test) # Support Vector Classifier
print("Training is complete")
Training is complete
rf_features = rf.feature_importances(x_train,y_train)
et_features = et.feature_importances(x_train, y_train)
ada_features = ada.feature_importances(x_train, y_train)
gb_features = gb.feature_importances(x_train,y_train)
# rf_features = ' 0.10117536 0.19737276 0.0500577 0.01758493 0.14145643 0.06167856\
# 0.09210271 0.06871121 0.23547256 0.00354938 0.01311831 0.01772008'
# et_features = ' 0.10295689 0.31295519 0.03997998 0.02517426 0.06224174 0.05172749\
# 0.06910707 0.04409819 0.22901159 0.00888276 0.01756171 0.03630312'
# ada_features = ' 0.01 0.01 0.026 0. 0.822 0.018 0.028 0.034 0.042 0.006\
# 0.004 0. '
# gb_features = '0.01068577 0.0241477 0.12169894 0.01267324 0.59006613 0.05527318\
# 0.04954347 0.05730464 0.04142735 0.01500765 0.02054994 0.00162199'
# rf_features = map(float, rf_features.strip().split())
# et_features = map(float, et_features.strip().split())
# ada_features = map(float, ada_features.strip().split())
# gb_features = map(float, gb_features.strip().split())
# The importances were computed on the reduced matrix, so take only the
# names of the features kept by SelectFromModel
cols = train_data.columns[model.get_support()].values
print len(cols)
# Create a dataframe with features
feature_dataframe = pd.DataFrame({'features': cols,
    'Random Forest feature importances': rf_features,
    'Extra Trees feature importances': et_features,
    'AdaBoost feature importances': ada_features,
    'Gradient Boost feature importances': gb_features
    })
11
Use the plotly library to display each feature's importance under each model, and the averaged importance.
- plotly is a JS library with a Python interface; great interactivity, rather verbose syntax, and so-so documentation.
import plotly.graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)
# Scatter plot of one model's feature importances (the same plot is drawn
# for each of the four tree-based models, so it is factored into a helper)
def plot_importance_scatter(col, title):
    trace = go.Scatter(
        y=feature_dataframe[col].values,
        x=feature_dataframe['features'].values,
        mode='markers',
        marker=dict(
            sizemode='diameter',
            sizeref=1,
            size=25,
            color=feature_dataframe[col].values,
            colorscale='Portland',
            showscale=True
        ),
        text=feature_dataframe['features'].values
    )
    layout = go.Layout(
        autosize=True,
        title=title,
        hovermode='closest',
        yaxis=dict(
            title='Feature Importance',
            ticklen=5,
            gridwidth=2
        ),
        showlegend=False
    )
    fig = go.Figure(data=[trace], layout=layout)
    py.iplot(fig, filename='scatter2010')

plot_importance_scatter('Random Forest feature importances', 'Random Forest Feature Importance')
plot_importance_scatter('Extra Trees feature importances', 'Extra Trees Feature Importance')
plot_importance_scatter('AdaBoost feature importances', 'AdaBoost Feature Importance')
plot_importance_scatter('Gradient Boost feature importances', 'Gradient Boosting Feature Importance')
# Create a new column containing the row-wise average of the four importance columns
feature_dataframe['mean'] = feature_dataframe.mean(axis=1)  # axis=1 averages across models; the non-numeric 'features' column is skipped
feature_dataframe.head(3)
y = feature_dataframe['mean'].values
x = feature_dataframe['features'].values
data = [go.Bar(
x= x,
y= y,
width = 0.5,
marker=dict(
color = feature_dataframe['mean'].values,
colorscale='Portland',
showscale=True,
reversescale = False
),
opacity=0.6
)]
layout= go.Layout(
autosize= True,
title= 'Barplots of Mean Feature Importance',
hovermode= 'closest',
# xaxis= dict(
# title= 'Pop',
# ticklen= 5,
# zeroline= False,
# gridwidth= 2,
# ),
yaxis=dict(
title= 'Feature Importance',
ticklen= 5,
gridwidth= 2
),
showlegend= False
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='bar-direct-labels')
base_predictions_train = pd.DataFrame( {'RandomForest': rf_oof_train.ravel(),
'ExtraTrees': et_oof_train.ravel(),
'AdaBoost': ada_oof_train.ravel(),
'GradientBoost': gb_oof_train.ravel()
})
base_predictions_train.head()
  | AdaBoost | ExtraTrees | GradientBoost | RandomForest
---|---|---|---|---
0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 1.0 | 1.0 | 1.0 | 1.0 |
2 | 1.0 | 1.0 | 0.0 | 0.0 |
3 | 1.0 | 1.0 | 1.0 | 1.0 |
4 | 0.0 | 0.0 | 0.0 | 0.0 |
data = [
go.Heatmap(
z= base_predictions_train.astype(float).corr().values ,
x=base_predictions_train.columns.values,
y= base_predictions_train.columns.values,
colorscale='Portland',
showscale=True,
reversescale = True
)
]
py.iplot(data, filename='labelled-heatmap')
x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train, svc_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test, svc_oof_test), axis=1)
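At this point the level-2 feature matrices have one column per base model. A quick shape check (added here, not in the original run):

# Hypothetical check: five OOF columns each, for 891 train and 418 test rows.
print x_train.shape, x_test.shape  # expected: (891, 5) (418, 5)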
import os
mingw_path = r'C:\Program Files\mingw-w64\x86_64-6.3.0-posix-seh-rt_v5-rev1\mingw64\bin'  # raw string: '\x86' would otherwise be a hex escape
os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']
import xgboost as xgb
gbm = xgb.XGBClassifier(
    #learning_rate = 0.02,
    n_estimators=2000,
    max_depth=4,
    min_child_weight=2,
    #gamma=1,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    n_jobs=-1,  # nthread is deprecated in favour of n_jobs
    scale_pos_weight=1).fit(x_train, y_train)
predictions = gbm.predict(x_test)
PassengerId = pd.read_csv(u'E:\\资料书籍\\天池\\泰坦尼克号\\test.csv', sep=',').PassengerId
DataFrame({ 'PassengerId': PassengerId, 'Survived': predictions }).to_csv(u'E:\\资料书籍\\天池\\泰坦尼克号\\2017-8-9-20-25.csv', index=False)
def compute_score(clf, X, y, scoring='accuracy'):
    xval = cross_val_score(clf, X, y, cv=5, scoring=scoring, n_jobs=-1)
    return np.mean(xval)
compute_score(gbm, train_reduced, labels)
0.83164671657823797
Summary
This was my first Kaggle competition and I got a lot out of it. Starting from plotting and analysis, I built a first model with a random forest, which scored 0.76 when submitted; that felt decent at the time. Then the grind began: I tried engineering various new features and borrowed plenty of ideas online, but the best I could improve to was about 0.78.
Discretizing Fare turned out not to help much; I am not sure why.
Finally I reduced the feature set and stacked the five models, which brought the Kaggle score to about 0.803. Further tinkering did not improve it much. Still, Titanic is a great project to start with.
To finish, a picture to commemorate it, haha.