# 头部引入
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline  # NOTE: IPython magic — only valid inside a notebook; it is a SyntaxError in a plain .py script
import seaborn as sns
import warnings
# Silence ALL warnings globally — convenient in a notebook, but this also hides
# deprecation and data-quality warnings; avoid in production code.
warnings.filterwarnings("ignore")
import missingno as mso
import pandas_profiling
# --- Initial settings ---
pd.set_option("display.max_columns",100)
# --- Load data ---
# NOTE(review): the CSV name is Chinese ("训练数据.csv" = "training data.csv") and GBK-encoded.
data_init=pd.read_csv("训练数据.csv",encoding="gbk")
# --- Preprocessing ---
# FIX(review): `data_filer` was used everywhere below but never created in this
# file — only `data_init` exists. Presumably a filtering cell was lost when the
# notebook was exported; start from a copy so the rest of the script can run.
data_filer = data_init.copy()

# 1. Inspect the distinct values of the buffering counter
data_filer["BufferCounter"].unique()
# 2. List all columns
list(data_filer.columns)

# Numeric type conversion.
# BUG fixed: the original assigned THREE columns from a frame built from only
# TWO columns ('CpuUsage' was missing on the right-hand side), which raises a
# shape-mismatch error. Convert the same column list on both sides.
num_cols = ['RamUsage', 'CpuUsage', 'VideoTotleTraffic']
data_filer[num_cols] = data_filer[num_cols].apply(pd.to_numeric, errors="ignore")

# Drop rows with missing latitude
data_filer = data_filer.dropna(subset=["Latitude"])

# Missing-value overview.
# BUG fixed: missingno was imported as `mso` (line 9), not `ms` — `ms.matrix`
# raised NameError.
mso.matrix(data_filer)

# Fill missing RX with the column mean
RX = np.mean(data_filer["RX"])
data_filer["RX"].fillna(RX, inplace=True)

# Pairwise correlations between the numeric attributes
data_filer.corr()
# One-hot encode the City column.
# NOTE(review): `X` is only built further down (the feature-selection cell) —
# the notebook cells were exported out of order; this must run after X exists.
X_City=pd.get_dummies(X["City"])
X=pd.concat([X,X_City],axis=1)
X=X.drop(["City"],axis=1)
# --- Timestamp features ---
# FIX: `astype(np.datetime64)` is a fragile/deprecated way to parse strings;
# pd.to_datetime is the supported API and yields the same datetime64[ns] dtype.
data_filer["VideoTestTime"] = pd.to_datetime(data_filer["VideoTestTime"])
# FIX: vectorised .dt accessors replace the per-row apply(lambda) calls —
# identical values, one pass each.
# NOTE(review): X must already contain "VideoTestTime" here (cells out of order).
X["year"] = X["VideoTestTime"].dt.year
X["month"] = X["VideoTestTime"].dt.month
X["Day"] = X["VideoTestTime"].dt.day
X["hour"] = X["VideoTestTime"].dt.hour
X["minute"] = X["VideoTestTime"].dt.minute
# Drop the raw timestamp once its components are extracted
X_data = X.drop(["VideoTestTime"], axis=1)
# --- Per-column handling ---
# Sorting: keep only rows whose p_date equals the dateBefore column, ordered by eNodeB id.
# NOTE(review): `data_base` is not defined anywhere in this file — presumably
# created in another notebook cell; verify before running.
data2=data_base[data_base["p_date"]==data_base["dateBefore"]].sort_values(by="enodebid")
import re
def f(x):
    """Extract the leading alphabetic brand token from a phone-type string.

    Returns the first run of letters followed by whitespace (e.g. "Apple "
    from "Apple iPhone"); failing that, the first run of letters; "other2"
    when the string contains no letters at all; and "other" when x is not a
    string (e.g. NaN), which makes re.search raise TypeError.
    """
    try:
        # FIX: don't shadow the builtin `str`; use raw strings for the regexes.
        m = re.search(r"[a-zA-Z]+\s", x)
        if m:
            return m.group()
        m2 = re.search(r"[a-zA-Z]+", x)
        if m2:
            return m2.group()
        return "other2"
    except TypeError:
        # FIX: the bare `except` swallowed every error; non-string input
        # (TypeError from re.search) is the only expected failure here.
        return "other"
# Derive a coarse brand column from the raw PhoneType strings using f() above
data_pho["PhoneTypenew"]=data_pho["PhoneType"].apply(f)
# --- Age column ---
# Fill missing ages with the mean age of rows sharing the same name title.
Age_Pre = data[["Age", "NameTitle"]].groupby("NameTitle")["Age"].mean()
# FIX: replaced the explicit Series iteration (and the leftover no-op
# `type(Age_Pre)` inspection line) with a vectorised fill: map each row's
# title to its group mean and use it only where Age is missing — identical
# result in a single pass.
data["Age"] = data["Age"].fillna(data["NameTitle"].map(Age_Pre))
# Mapping Age
# Bucket Age into 5 ordinal codes: <=16 -> 0, (16,32] -> 1, (32,48] -> 2,
# (48,64] -> 3, >64 -> 4.  NOTE: these assignments overwrite Age IN PLACE and
# are order-dependent — they only work because the codes written (0..4) are
# all <= 16 and therefore can never satisfy a later "> ..." condition.
# NaN ages fail every comparison and are left as NaN.
dataset.loc[ dataset['Age'] <= 16, 'Age']= 0
dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
dataset.loc[ dataset['Age'] > 64, 'Age'] = 4 ;
# Encode Sex as 0/1 (male/female); values outside the dict become NaN under .map
sex={"male":0,"female":1}
dataset["Sex"]=dataset["Sex"].map(sex)
# --- Feature engineering ---
# Parse the dd.mm.YYYY birthday strings and derive each player's age in years
# as of the fixed reference date 2013-01-01.
players['birth_date'] = pd.to_datetime(players.birthday, format='%d.%m.%Y')
reference_day = pd.Timestamp("2013-01-01")
elapsed_days = (reference_day - players['birth_date']).dt.days
players['age_years'] = elapsed_days / 365.25
players['age_years']
# Handle the discrete `position` category: create higher-level categories.
# First inspect the distinct raw positions.
position_types = players.position.unique()
position_types
# Notebook output, kept for reference (the original used mis-encoded Chinese
# quote marks that made this an unterminated string / SyntaxError):
# array(['Center Back', 'Attacking Midfielder', 'Right Midfielder',
#        'Center Midfielder', 'Goalkeeper', 'Defensive Midfielder',
#        'Left Fullback', nan, 'Left Midfielder', 'Right Fullback',
#        'Center Forward', 'Left Winger', 'Right Winger'], dtype=object)
# Position groups used to build the aggregated `position_agg` column.
defense = ['Center Back', 'Defensive Midfielder', 'Left Fullback', 'Right Fullback']
midfield = ['Right Midfielder', 'Center Midfielder', 'Left Midfielder']
forward = ['Attacking Midfielder', 'Left Winger', 'Right Winger', 'Center Forward']
keeper = 'Goalkeeper'
# Add the aggregated categorical column; positions outside every group
# (e.g. NaN) match no mask and are left as NaN.
for members, label in ((defense, "Defense"),
                       (midfield, "Midfield"),
                       (forward, "Forward"),
                       ([keeper], "Keeper")):
    players.loc[players['position'].isin(members), 'position_agg'] = label
# Select the model features (X) and the target (y = video buffering counter)
X=data_filer[['RamUsage', 'CpuUsage', 'Longitude', 'Latitude', 'City', 'Source',
'NetType', 'APN/SSID', 'RX', 'L_SINR', 'LteRsrq', 'CI', 'VideoAvgSpeed',
'VideoPeakSpeed', 'VideoTestTime',
'VideoTotleTraffic']]
y=data_filer["BufferCounter"]
# --- Visualisation ---
# BUG fixed: `ata_pho` was a typo for `data_pho` (the frame used in the
# PhoneType processing step above) and raised NameError.
# Bar-plot the 20 most common phone types.
data_pho["PhoneType"].value_counts()[0:20].plot(kind="bar")
# # 下面针对多个模型进行集成操作
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.kernel_approximation import Nystroem, RBFSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC, LinearSVC
# --- Ensemble algorithms ---
# Fixed random seed shared by all models for reproducibility
SEED=666
def get_models():
    """Build and return the library of base learners, keyed by short name."""
    return {
        'svm': SVC(C=100, probability=True),
        'knn': KNeighborsClassifier(n_neighbors=3),
        'naive bayes': GaussianNB(),
        'mlp-nn': MLPClassifier((80, 10), early_stopping=False, random_state=SEED),
        'random forest': RandomForestClassifier(n_estimators=10, max_features=3, random_state=SEED),
        'gbm': GradientBoostingClassifier(n_estimators=100, random_state=SEED),
        'logistic': LogisticRegression(C=100, random_state=SEED),
    }
def train_predict(model_list):
    """Fit every model in `model_list` on the training set and return a
    DataFrame of positive-class test-set probabilities, one column per model.

    BUG fixed: the loop iterated the *global* `models` dict instead of the
    `model_list` argument, silently ignoring the parameter.

    NOTE(review): X_train / y_train / X_test / y_test are module-level
    globals created in another notebook cell (train/test split not shown here).
    """
    P = pd.DataFrame(np.zeros((y_test.shape[0], len(model_list))))
    print("Fitting models.")
    cols = []
    for i, (name, m) in enumerate(model_list.items()):
        # flush=True so per-model progress actually appears before the slow fit
        print("%s..." % name, end=" ", flush=True)
        m.fit(X_train, y_train)
        # Binary task: keep only the positive-class probability column
        P.iloc[:, i] = m.predict_proba(X_test)[:, 1]
        cols.append(name)
        print("done")
    P.columns = cols
    print("Done.\n")
    return P
def score_models(P, y):
    """Print the ROC-AUC of every prediction column in P against labels y."""
    print("Scoring models.")
    for name in P.columns:
        auc = roc_auc_score(y, P.loc[:, name])
        print("%-26s: %.3f" % (name, auc))
    print("Done.\n")
# --- Train and score each standalone classifier defined above ---
models = get_models()
P = train_predict(models)
score_models(P, y_test)
# 绘制ROC曲线
from sklearn.metrics import roc_curve
def plot_roc_curve(ytest, P_base_learners, P_ensemble, labels, ens_label):
    """Plot ROC curves for every base learner and for the ensemble.

    P_base_learners is an (n_samples, n_learners) probability matrix;
    P_ensemble is the ensemble's probability vector; labels names the columns.
    """
    plt.figure(figsize=(10, 8))
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    n_learners = P_base_learners.shape[1]
    # One rainbow colour per learner, reserving palette[0] for the ensemble
    palette = [plt.cm.rainbow(t) for t in np.linspace(0, 1.0, n_learners + 1)]
    for idx in range(n_learners):
        fpr, tpr, _ = roc_curve(ytest, P_base_learners[:, idx])
        plt.plot(fpr, tpr, label=labels[idx], c=palette[idx + 1])
    fpr, tpr, _ = roc_curve(ytest, P_ensemble)
    plt.plot(fpr, tpr, label=ens_label, c=palette[0])
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(frameon=False)
    plt.show()
# Plot every base learner against the simple averaged ensemble
plot_roc_curve(y_test, P.values, P.mean(axis=1), list(P.columns), "ensemble")
# Drop the weakest base learner (the MLP) and re-score the averaged ensemble
include = [c for c in P.columns if c not in ["mlp-nn"]]
# BUG fixed: `ytest` was used here although the test labels are named `y_test`
# everywhere else in this section (NameError / inconsistent naming).
print("Truncated ensemble ROC-AUC score: %.3f" % roc_auc_score(y_test, P.loc[:, include].mean(axis=1)))
# --- Visualise each model's hard predictions (threshold 0.5) ---
# For every model column: normalized counts of predicted 0s vs 1s.
# NOTE(review): value_counts() orders rows by frequency, so the relabelling
# `p.index = ["DEM", "REP"]` assumes the same order in every column and
# exactly two classes — fragile; confirm against the actual data.
p = P.apply(lambda x: 1*(x >= 0.5).value_counts(normalize=True))
p.index = ["DEM", "REP"]
p.loc["REP", :].sort_values().plot(kind="bar")
plt.axhline(0.25, color="k", linewidth=0.5)
plt.text(0., 0.23, "True share republicans")
plt.show()
# --- Stacking model ---
# FIX: the numbered section headings below were bare (non-comment) Chinese
# text lines — SyntaxErrors in a plain .py file; they are now comments.

# 1. Define the base models.
base_learners = get_models()

# 2. Define the weighting (meta) model — the second layer of the stack.
meta_learner = GradientBoostingClassifier(
    n_estimators=1000,
    loss="exponential",
    max_features=4,
    max_depth=3,
    subsample=0.5,
    learning_rate=0.005,
    random_state=SEED
)

# 3. Split the base-model training data in two halves: one to fit the base
#    learners, one to generate the meta learner's training inputs.
xtrain_base, xpred_base, ytrain_base, ypred_base = train_test_split(
    xtrain, ytrain, test_size=0.5, random_state=SEED)
# 4. Train our base models.
def train_base_learners(base_learners, inp, out, verbose=True):
    """Fit each base learner in the library on the given training data."""
    if verbose:
        print("Fitting models.")
    for name, learner in base_learners.items():
        if verbose:
            print("%s..." % name, end=" ", flush=False)
        learner.fit(inp, out)
        if verbose:
            print("done")
# Fit the base learners on their half of the training split
train_base_learners(base_learners, xtrain_base, ytrain_base)
# 5. Prepare the training data for the second-stage (weighting) classifier.
def predict_base_learners(pred_base_learners, inp, verbose=True):
    """Return an (n_samples, n_learners) matrix of positive-class probabilities."""
    n_rows = inp.shape[0]
    P = np.zeros((n_rows, len(pred_base_learners)))
    if verbose:
        print("Generating base learner predictions.")
    for col, (name, learner) in enumerate(pred_base_learners.items()):
        if verbose:
            print("%s..." % name, end=" ", flush=False)
        proba = learner.predict_proba(inp)
        # Two-class problem: keep only the positive-class column
        P[:, col] = proba[:, 1]
        if verbose:
            print("done")
    return P
# Base-learner probabilities on the held-out half: training inputs for the meta learner
P_base = predict_base_learners(base_learners, xpred_base)
# 6. Train the second stage to obtain the final classification.
# Fit the meta learner on the base-learner prediction matrix and its labels
meta_learner.fit(P_base, ypred_base)
def ensemble_predict(base_learners, meta_learner, inp, verbose=True):
    """Predict with the full stack: base-learner probabilities, then the meta learner.

    Returns (P_pred, p): the base-learner probability matrix and the meta
    learner's positive-class probabilities on it.
    """
    P_pred = predict_base_learners(base_learners, inp, verbose=verbose)
    meta_p = meta_learner.predict_proba(P_pred)[:, 1]
    return P_pred, meta_p
# Predict the held-out test set with the stacked model and score it.
# NOTE(review): `xtest`/`ytest` vs `X_test`/`y_test` naming is inconsistent in
# this file — presumably both splits exist in other notebook cells; confirm.
P_pred, p = ensemble_predict(base_learners, meta_learner, xtest)
print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(ytest, p))
# The split above sacrificed half of the training data; switch to cross-validation below.
from sklearn.base import clone
def stacking(base_learners, meta_learner, X, y, generator):
    """Simple training routine for stacking.

    Fits `base_learners` on ALL of (X, y) for test-time use, builds
    out-of-fold base-learner predictions with the CV splitter `generator`,
    and fits `meta_learner` on those cross-validated predictions.
    X and y must be numpy arrays (indexed positionally below).
    Returns the tuple (base_learners, meta_learner), both fitted.
    """
    # Train final base learners for test time
    print("Fitting final base learners...", end="")
    train_base_learners(base_learners, X, y, verbose=False)
    print("done")
    # Generate predictions for training meta learners
    # Outer loop: one pass per CV fold
    print("Generating cross-validated predictions...")
    cv_preds, cv_y = [], []
    for i, (train_idx, test_idx) in enumerate(generator.split(X)):
        fold_xtrain, fold_ytrain = X[train_idx, :], y[train_idx]
        fold_xtest, fold_ytest = X[test_idx, :], y[test_idx]
        # Inner loop: step 4 and 5 — clone so the final full-data fits
        # above are not overwritten by the per-fold fits
        fold_base_learners = {name: clone(model)
                              for name, model in base_learners.items()}
        train_base_learners(
            fold_base_learners, fold_xtrain, fold_ytrain, verbose=False)
        fold_P_base = predict_base_learners(
            fold_base_learners, fold_xtest, verbose=False)
        cv_preds.append(fold_P_base)
        cv_y.append(fold_ytest)
        print("Fold %i done" % (i + 1))
    print("CV-predictions done")
    # Be careful to get rows in the right order: predictions and labels are
    # stacked fold-by-fold in the same generation order, so they stay aligned
    cv_preds = np.vstack(cv_preds)
    cv_y = np.hstack(cv_y)
    # Train meta learner on the out-of-fold prediction matrix
    print("Fitting meta learner...", end="")
    meta_learner.fit(cv_preds, cv_y)
    print("done")
    return base_learners, meta_learner
from sklearn.model_selection import KFold
# Train with stacking.
# NOTE(review): KFold(2) does not shuffle; `.values` converts the pandas
# frames/series to numpy arrays because stacking() indexes positionally.
cv_base_learners, cv_meta_learner = stacking(
    get_models(), clone(meta_learner), xtrain.values, ytrain.values, KFold(2))
P_pred, p = ensemble_predict(cv_base_learners, cv_meta_learner, xtest, verbose=False)
print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(ytest, p))
# A parallel approach for efficiency:
from mlens.ensemble import SuperLearner
# Instantiate the ensemble with 10 folds, running folds in parallel processes
sl = SuperLearner(
    folds=10,
    random_state=SEED,
    verbose=2,
    backend="multiprocessing"
)
# Add the base learners and the meta learner (proba=True: pass probabilities between layers)
sl.add(list(base_learners.values()), proba=True)
sl.add_meta(meta_learner, proba=True)
# Train the ensemble
sl.fit(xtrain, ytrain)
# Predict the test set; column 1 of predict_proba is the positive class
p_sl = sl.predict_proba(xtest)
print("\nSuper Learner ROC-AUC score: %.3f" % roc_auc_score(ytest, p_sl[:, 1]))