Titanic Passenger Survival Prediction: A Keras Walkthrough

  1. Case Analysis and Data Preprocessing
    1.1. Download the Titanic passenger dataset
    1.2. Select the required fields
    1.3. Encode categorical features
    1.4. Define a data preprocessing function
  2. Keras Modeling and Application
    2.1. Complete program
    2.2. Visualizing the training process with TensorBoard

Predicting survival probabilities for Titanic passengers with Keras, TensorFlow's high-level framework.

About using Keras: Keras is TensorFlow's high-level model-building API, exposed as tf.keras and used throughout this example.
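As a quick orientation before the Titanic case, the typical tf.keras workflow is: define a Sequential model, compile it with an optimizer and loss, then fit it on NumPy arrays. The sketch below is only an illustration with made-up toy data (the names x_toy, y_toy, toy_model are not part of the Titanic example):

import numpy as np
import tensorflow as tf

# Toy data: 100 samples with 7 features and binary labels (illustration only)
x_toy = np.random.rand(100, 7)
y_toy = np.random.randint(0, 2, size=(100,))

# Define -> compile -> fit: the same three steps used in the full example below
toy_model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(16, activation='relu', input_shape=(7,)),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
toy_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
toy_model.fit(x_toy, y_toy, epochs=5, verbose=0)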

Case Analysis and Data Preprocessing

Download the Titanic Passenger Dataset

import urllib.request
import os

data_url = "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls"
data_file_path = "data/titanic.xls"

if not os.path.isfile(data_file_path):
    # Download the data file if it is not already on disk
    result = urllib.request.urlretrieve(data_url, data_file_path)
    print('downloaded:', result)
else:
    print(data_file_path, 'data file already exists.')

import numpy
import pandas as pd

# Read the data file; the result is a DataFrame
df_data = pd.read_excel(data_file_path)
# View a statistical summary of the data
df_data.describe()

Select the Required Fields

survived (whether the passenger survived) is the label field; the others are candidate feature fields.
Select the fields we need and drop columns such as ticket and cabin.

selected_cols = ['survived', 'name', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
selected_df_data = df_data[selected_cols]

# Find the fields that contain null values
# Column-level check: True if the column has any null/NA element, otherwise False
selected_df_data.isnull().any()
# Count the number of null values in each column
selected_df_data.isnull().sum()
# Show the rows that contain null values, to locate them
selected_df_data[selected_df_data.isnull().values==True]

# Fill missing age values with the mean age
age_mean_value = selected_df_data['age'].mean()
selected_df_data['age'] = selected_df_data['age'].fillna(age_mean_value)
# Fill missing fare values with the mean fare
fare_mean_value = selected_df_data['fare'].mean()
selected_df_data['fare'] = selected_df_data['fare'].fillna(fare_mean_value)
# Fill missing embarked values with 'S'
selected_df_data['embarked'] = selected_df_data['embarked'].fillna('S')
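To verify the fills, the null-count check from above can be repeated; every column should now report 0 (a quick sanity check added here, not part of the original listing):

# Re-check missing values after filling; all counts should be 0
selected_df_data.isnull().sum()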

Encode Categorical Features

# Convert the sex field from strings to numeric codes
selected_df_data['sex'] = selected_df_data['sex'].map({'female': 0, 'male': 1}).astype(int)

# Convert the embarked field from letters to numeric codes
selected_df_data['embarked'] = selected_df_data['embarked'].map({'C': 0, 'Q': 1, 'S': 2}).astype(int)

# drop does not modify the original DataFrame; it returns a new DataFrame with the column removed
# axis=1 means drop a column
selected_df_data = selected_df_data.drop(['name'], axis=1)

# Separate feature values and label values
# Convert to an ndarray
ndarray_data = selected_df_data.values
# The last 7 columns are the features
features = ndarray_data[:, 1:]
# Column 0 is the label
label = ndarray_data[:, 0]

from sklearn import preprocessing

# Scale the features to the range [0, 1]
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
norm_features = minmax_scale.fit_transform(features)

Define a Data Preprocessing Function

from sklearn import preprocessing

def prepare_data(df_data):
    df = df_data.drop(['name'], axis=1)           # drop the name column
    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean)        # fill missing age values
    fare_mean = df['fare'].mean()
    df['fare'] = df['fare'].fillna(fare_mean)     # fill missing fare values
    df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)            # encode sex as numbers
    df['embarked'] = df['embarked'].fillna('S')   # fill missing embarked values
    df['embarked'] = df['embarked'].map({'C': 0, 'Q': 1, 'S': 2}).astype(int)  # encode embarked as numbers
    ndarray_data = df.values                      # convert to an ndarray

    features = ndarray_data[:, 1:]                # the last 7 columns are the features
    label = ndarray_data[:, 0]                    # column 0 is the label

    # Scale the features to the range [0, 1]
    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    norm_features = minmax_scale.fit_transform(features)
    return norm_features, label
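prepare_data expects a DataFrame whose first column is survived and returns the min-max-scaled feature matrix together with the label vector; the complete program below calls it exactly this way (the print is only a shape sanity check):

x_data, y_data = prepare_data(selected_df_data)
print(x_data.shape, y_data.shape)   # number of passengers x 7 features, and one label per passenger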

Keras Modeling and Application

Complete Program

import urllib.request
import os

data_url = "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls"
data_file_path = "data/titanic.xls"

if not os.path.isfile(data_file_path):
    # Download the data file if it is not already on disk
    result = urllib.request.urlretrieve(data_url, data_file_path)
    print('downloaded:', result)
else:
    print(data_file_path, 'data file already exists.')

import numpy
import pandas as pd

# Read the data file; the result is a DataFrame
df_data = pd.read_excel(data_file_path)

selected_cols = ['survived', 'name', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
selected_df_data = df_data[selected_cols]

from sklearn import preprocessing

def prepare_data(df_data):
    df = df_data.drop(['name'], axis=1)           # drop the name column
    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean)        # fill missing age values
    fare_mean = df['fare'].mean()
    df['fare'] = df['fare'].fillna(fare_mean)     # fill missing fare values
    df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)            # encode sex as numbers
    df['embarked'] = df['embarked'].fillna('S')   # fill missing embarked values
    df['embarked'] = df['embarked'].map({'C': 0, 'Q': 1, 'S': 2}).astype(int)  # encode embarked as numbers
    ndarray_data = df.values                      # convert to an ndarray

    features = ndarray_data[:, 1:]                # the last 7 columns are the features
    label = ndarray_data[:, 0]                    # column 0 is the label

    # Scale the features to the range [0, 1]
    minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
    norm_features = minmax_scale.fit_transform(features)
    return norm_features, label

# Shuffle the row order using Pandas' sampling function sample; frac is the fraction of rows to draw
# selected_df_data itself is left unchanged
shuffled_df_data = selected_df_data.sample(frac=1)

x_data, y_data = prepare_data(shuffled_df_data)

# Use 80% of the data for training and the remaining 20% for testing
train_size = int(len(x_data) * 0.8)
x_train = x_data[:train_size]
y_train = y_data[:train_size]
x_test = x_data[train_size:]
y_test = y_data[train_size:]

import tensorflow as tf
from tensorflow import keras

# Build a Keras Sequential model
model = tf.keras.models.Sequential()

# Add the first layer; the input has 7 feature columns (input_shape=(7,) could also be used)
model.add(tf.keras.layers.Dense(units=64,
                                input_dim=7,
                                use_bias=True,
                                kernel_initializer='uniform',
                                bias_initializer='zeros',
                                activation='relu'))

model.add(tf.keras.layers.Dense(units=32,
                                activation='sigmoid'))
model.add(tf.keras.layers.Dense(units=1,
                                activation='sigmoid'))
model.summary()

model.compile(optimizer=tf.keras.optimizers.Adam(0.003),
              loss='binary_crossentropy',
              metrics=['accuracy'])

train_history = model.fit(x=x_train,
                          y=y_train,
                          validation_split=0.2,
                          epochs=100,
                          batch_size=40,
                          verbose=2)
train_history.history.keys()

# Visualize the training history
import matplotlib.pyplot as plt

def visu_train_history(train_history, train_metric, validation_metric):
    plt.plot(train_history.history[train_metric])
    plt.plot(train_history.history[validation_metric])
    plt.title('Train History')
    plt.ylabel(train_metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

# Note: depending on the TensorFlow version, the accuracy keys may be
# 'acc'/'val_acc' (TF 1.x) or 'accuracy'/'val_accuracy' (TF 2.x);
# check train_history.history.keys() and use the matching names.
visu_train_history(train_history, 'acc', 'val_acc')

visu_train_history(train_history, 'loss', 'val_loss')
# Evaluate the model on the test set
evaluate_result = model.evaluate(x=x_test,
                                 y=y_test)

evaluate_result
model.metrics_names

# Use the model for prediction
selected_cols
# Passenger information for Jack and Rose
Jack_info = [0, 'Jack', 3, 'male', 23, 1, 0, 5.0000, 'S']
Rose_info = [1, 'Rose', 1, 'female', 20, 1, 0, 100.0000, 'S']

# Create a DataFrame for the new passengers
new_passenger_pd = pd.DataFrame([Jack_info, Rose_info], columns=selected_cols)

# Append the new passengers to the original DataFrame
# (DataFrame.append was removed in pandas 2.0; pd.concat does the same job)
all_passenger_pd = pd.concat([selected_df_data, new_passenger_pd], ignore_index=True)

all_passenger_pd[-3:]

# Prepare the data
x_features, y_label = prepare_data(all_passenger_pd)

# Use the model to compute each passenger's survival probability
surv_probability = model.predict(x_features)

surv_probability[:5]

# Insert the survival probability as the last column of the table
all_passenger_pd.insert(len(all_passenger_pd.columns), 'surv_probability', surv_probability)

all_passenger_pd[-5:]

# Set up callbacks; other built-in callbacks include:
# tf.keras.callbacks.LearningRateScheduler()
# tf.keras.callbacks.EarlyStopping()
logdir = './logs'
checkpoint_path = './checkpoint/Titanic.{epoch:02d}-{val_loss:.2f}.ckpt'

callbacks = [
    tf.keras.callbacks.TensorBoard(log_dir=logdir,
                                   histogram_freq=2),
    tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                       save_weights_only=True,
                                       verbose=1,
                                       period=5)
]
train_history = model.fit(x=x_train,
                          y=y_train,
                          validation_split=0.2,
                          epochs=100,
                          batch_size=40,
                          callbacks=callbacks,
                          verbose=2)
# Restore the model from the checkpoint files
logdir = './logs'
checkpoint_path = './checkpoint/Titanic.{epoch:02d}-{val_loss:.2f}.ckpt'
checkpoint_dir = os.path.dirname(checkpoint_path)

# Find the most recent checkpoint
latest = tf.train.latest_checkpoint(checkpoint_dir)
latest

model.load_weights(latest)
loss, acc = model.evaluate(x_test, y_test)
print("Restored model, accuracy: {:5.2f}%".format(100 * acc))


Visualizing the Training Process with TensorBoard

[TensorBoard screenshot: GRAPHS panel showing the model's computation graph]
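After the TensorBoard callback above has written event files to ./logs, the dashboard can be launched from a terminal (standard TensorBoard usage):

tensorboard --logdir=./logs

Then open http://localhost:6006 (the default port) in a browser; the SCALARS tab shows the loss and accuracy curves, and the GRAPHS tab shows the model's computation graph as pictured above.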