1. Download the dataset
!pip install -q tf-nightly  # Requires tf 1.13

from __future__ import absolute_import, division, print_function

import numpy as np
import pandas as pd
import tensorflow as tf

tf.enable_eager_execution()
tf.logging.set_verbosity(tf.logging.ERROR)
tf.set_random_seed(123)

# Load dataset.
dftrain = pd.read_csv('https://storage.googleapis.com/tfbt/titanic_train.csv')
dfeval = pd.read_csv('https://storage.googleapis.com/tfbt/titanic_eval.csv')
y_train = dftrain.pop('survived')
y_eval = dfeval.pop('survived')
2. A series of data checks
These one-liners are particularly handy for a quick look at the categorical distributions:
dftrain.sex.value_counts().plot(kind='barh');

(dftrain['class']
 .value_counts()
 .plot(kind='barh'));
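For a broader first pass, a couple of additional pandas one-liners are handy (these particular checks are my own sketch, not part of the original notebook):

dftrain.describe()                 # summary statistics for the numeric columns
dftrain.age.hist(bins=20);         # distribution of passenger ages
print(len(dftrain), len(dfeval))   # training vs. evaluation set sizes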
3. Create feature columns and input functions
3.1 One-hot encoding, normalization, and bucketization

Feature columns let you transform raw inputs with one-hot encoding, normalization, and bucketization before they reach the model.
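The code below only uses one-hot encoding plus raw numeric columns, but for reference, bucketization and normalization look roughly like this (the age boundaries are placeholder values chosen for illustration):

# Sketch: bucketize 'age' into coarse ranges (placeholder boundaries).
age_buckets = tf.feature_column.bucketized_column(
    tf.feature_column.numeric_column('age', dtype=tf.float32),
    boundaries=[18, 30, 45, 60])

# Sketch: z-score-normalize 'fare' using statistics from the training set.
fare_mean, fare_std = dftrain['fare'].mean(), dftrain['fare'].std()
fare_normalized = tf.feature_column.numeric_column(
    'fare', normalizer_fn=lambda x: (x - fare_mean) / fare_std)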
3.2 Numeric and categorical columns
fc = tf.feature_column

CATEGORICAL_COLUMNS = ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck',
                       'embark_town', 'alone']
NUMERIC_COLUMNS = ['age', 'fare']

def one_hot_cat_column(feature_name, vocab):
  return fc.indicator_column(
      fc.categorical_column_with_vocabulary_list(feature_name, vocab))

feature_columns = []
for feature_name in CATEGORICAL_COLUMNS:
  # Need to one-hot encode categorical features.
  vocabulary = dftrain[feature_name].unique()
  print(feature_name, vocabulary)
  feature_columns.append(one_hot_cat_column(feature_name, vocabulary))
print(feature_columns, "\n")

for feature_name in NUMERIC_COLUMNS:
  feature_columns.append(fc.numeric_column(feature_name, dtype=tf.float32))
3.3 View all of the feature-column transformations on the first (index 0) example
example = dftrain.head(1)  # The first (index 0) example.
fc.input_layer(dict(example), feature_columns).numpy()
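You can also pass a single feature column to inspect one transformation in isolation; for example, the one-hot encoding of 'class' (a sketch reusing the helper defined above):

class_fc = one_hot_cat_column('class', dftrain['class'].unique())
print(fc.input_layer(dict(example), [class_fc]).numpy())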
3.4 Create the input functions
# Use entire batch since this is such a small dataset.
NUM_EXAMPLES = len(y_train)

def make_input_fn(X, y, n_epochs=None, shuffle=True):
  def input_fn():
    dataset = tf.data.Dataset.from_tensor_slices((dict(X), y))
    if shuffle:
      dataset = dataset.shuffle(NUM_EXAMPLES)
    # For training, cycle through the dataset as many times as needed (n_epochs=None).
    dataset = dataset.repeat(n_epochs)
    # In-memory training doesn't use batching: the whole dataset is one batch.
    dataset = dataset.batch(NUM_EXAMPLES)
    return dataset
  return input_fn

# Training and evaluation input functions.
train_input_fn = make_input_fn(dftrain, y_train)
eval_input_fn = make_input_fn(dfeval, y_eval, shuffle=False, n_epochs=1)
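Because eager execution is enabled, the input function can be sanity-checked by pulling a single batch directly (my own check, not from the original walkthrough):

ds = make_input_fn(dftrain, y_train, n_epochs=1)()
for feature_batch, label_batch in ds.take(1):
  print('feature keys:', list(feature_batch.keys()))
  print('first labels:', label_batch[:5].numpy())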
4. Train and evaluate the model
4.1 First train a linear classifier (a logistic regression model) as a binary-classification baseline
linear_est = tf.estimator.LinearClassifier(feature_columns)

# Train model.
linear_est.train(train_input_fn, max_steps=100)

# Evaluation.
results = linear_est.evaluate(eval_input_fn)
print('Accuracy : ', results['accuracy'])
print('Dummy model: ', results['accuracy_baseline'])
4.2 Train a Boosted Trees model
# Since the data fits into memory, use the entire dataset per layer. It will be faster.
# Above, one batch is defined as the entire dataset.
n_batches = 1
est = tf.estimator.BoostedTreesClassifier(feature_columns,
                                          n_batches_per_layer=n_batches)
# Alternatively, specify the number of trees explicitly:
# est = tf.estimator.BoostedTreesClassifier(feature_columns,
#                                           n_batches_per_layer=n_batches,
#                                           n_trees=300)

# The model will stop training once the specified number of trees is built, not
# based on the number of steps.
est.train(train_input_fn, max_steps=100)

# Eval.
results = est.evaluate(eval_input_fn)
print('Accuracy : ', results['accuracy'])
print('Dummy model: ', results['accuracy_baseline'])
4.3 In-memory training mode
def make_inmemory_train_input_fn(X, y):
  def input_fn():
    return dict(X), y
  return input_fn

train_input_fn = make_inmemory_train_input_fn(dftrain, y_train)
eval_input_fn = make_input_fn(dfeval, y_eval, shuffle=False, n_epochs=1)

est = tf.contrib.estimator.boosted_trees_classifier_train_in_memory(
    train_input_fn, feature_columns)
print(est.evaluate(eval_input_fn)['accuracy'])
5. Make predictions
pred_dicts = list(est.predict(eval_input_fn))
probs = pd.Series([pred['probabilities'][1] for pred in pred_dicts])

probs.plot(kind='hist', bins=20, title='predicted probabilities');
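To turn these probabilities into hard class labels, threshold at 0.5 (for canned classifiers the prediction dicts should also carry a 'class_ids' entry with the same information). A minimal sketch:

pred_labels = (probs > 0.5).astype(int)
print('recomputed accuracy:', (pred_labels == y_eval).mean())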
6. Plot the ROC curve
from sklearn.metrics import roc_curve
from matplotlib import pyplot as plt

fpr, tpr, _ = roc_curve(y_eval, probs)
plt.plot(fpr, tpr)
plt.title('ROC curve')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.xlim(0,)
plt.ylim(0,);
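The area under this curve (AUC) summarizes the ranking quality in one number, and sklearn can compute it directly from the same inputs:

from sklearn.metrics import roc_auc_score
print('AUC:', roc_auc_score(y_eval, probs))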