1、导入需要的包
# 导入包
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn import metrics
# Plotting defaults: register SimHei as the sans-serif font and switch off the
# 'axes.unicode_minus' option (presumably so Chinese labels and minus signs
# render correctly — confirm).
# NOTE(review): `plt` is not imported anywhere in the visible lines; presumably
# `import matplotlib.pyplot as plt` is intended. Confirm and add it, otherwise
# these two assignments raise NameError.
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False
2、导入已整理好的数据
# Load the prepared dataset (a GBK-encoded CSV) and report its shape.
org_data = pd.read_csv("org_data.csv", encoding="gbk")
print(org_data.shape)
(4754, 58)
查看变量总数及变量名称
# All column labels of the dataset: print how many there are, then the labels.
var_total = org_data.columns
print(len(var_total), var_total, sep="\n")
58
Index(['trans_fail_top_count_enum_last_1_month', 'latest_one_month_suc',
'rank_trad_1_month', 'top_trans_count_last_1_month',
'avg_price_top_last_12_valid_month', 'query_cash_count',
'consfin_product_count', 'consfin_org_count_current',
'consfin_org_count_behavior', 'history_suc_fee',
'loans_latest_time_days', 'max_cumulative_consume_later_1_month',
'trans_day_last_12_month', 'query_org_count',
'number_of_trans_from_2011', 'trans_amount_3_month',
'trans_top_time_last_1_month', 'loans_org_count_current', 'loans_score',
'loans_cash_count', 'pawns_auctions_trusts_consume_last_6_month',
'apply_score', 'query_sum_count', 'consume_top_time_last_6_month',
'apply_credibility', 'latest_one_month_apply', 'loans_product_count',
'trans_top_time_last_6_month', 'loans_org_count_behavior',
'loans_credit_limit', 'history_fail_fee', 'loans_latest_day',
'middle_volume_percent', 'consume_mini_time_last_1_month',
'first_transaction_day', 'loans_count', 'consfin_avg_limit',
'trans_fail_top_count_enum_last_12_month', 'latest_one_month_fail',
'trans_fail_top_count_enum_last_6_month', 'latest_six_month_loan',
'low_volume_percent', 'loans_settle_count', 'trans_activity_day',
'historical_trans_day', 'latest_query_time_days', 'loans_max_limit',
'loans_overdue_count', 'consume_top_time_last_1_month',
'historical_trans_amount', 'latest_query_day',
'avg_price_last_12_month', 'pawns_auctions_trusts_consume_last_1_month',
'consfin_credibility', 'consfin_max_limit', 'consfin_credit_limit',
'latest_three_month_loan', 'status'],
dtype='object')
3、生成因变量和自变量列表
# Dependent (target) variable.
var_y = ['status']
# Independent variables: every column except the target, kept in the original
# column order. A set difference would return them in an arbitrary order that
# can change from one interpreter run to the next (string hashing is
# randomized), making the column order of the feature matrix — and with it any
# order-sensitive model — non-reproducible.
var_x = [col for col in var_total if col not in var_y]
4、拆分训练集和验证集
# Split the data: separate features from the target, then hold out 30% of the
# rows as the test set (random_state is fixed so the split is repeatable).
x = org_data[var_x]
y = org_data[var_y]
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=2018
)
# Report the shape of the full data and of every partition, one per line.
print(
    x.shape,
    y.shape,
    x_train.shape,
    y_train.shape,
    x_test.shape,
    y_test.shape,
    sep="\n",
)
(4754, 57)
(4754, 1)
(3327, 57)
(3327, 1)
(1427, 57)
(1427, 1)
5、模型实例化
# Instantiate the models with their default hyper-parameters:
# logistic regression, SVM and decision tree; random forest and XGBoost.
Lr = LogisticRegression()
svc = SVC()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
xgb = XGBClassifier()
# Display name -> estimator instance.
model_dict = {"逻辑回归": Lr, "SVM": svc, "决策树": dt, "随机森林": rf, "XGBoost": xgb}
results = pd.DataFrame()
# Only the display names are printed, so iterate over the keys directly
# instead of unpacking items whose values are never used.
for name in model_dict:
    print(name)
逻辑回归
SVM
决策树
随机森林
XGBoost
6、进行模型训练,打印模型预测效果
# Train every model and collect its evaluation metrics.
def _flatten_single_column(y):
    """Return a single-column 2-D target as a 1-D array; pass anything else through."""
    shape = getattr(y, "shape", None)
    if shape is not None and len(shape) == 2 and shape[1] == 1:
        # A DataFrame exposes its ndarray via `.values`; an ndarray is used as is.
        return getattr(y, "values", y).ravel()
    return y


def model_est(model_dict, x_train, x_test, y_train, y_test):
    """Fit each model on the training split and score it on the test split.

    Args:
        model_dict: Mapping of display name -> estimator exposing ``fit`` and
            ``predict``.
        x_train: Training features.
        x_test: Test features.
        y_train: Training target; a single-column 2-D object is flattened to
            1-D before fitting.
        y_test: Test target, flattened the same way before scoring.

    Returns:
        A DataFrame with one column per model (named by its key in
        ``model_dict``, in insertion order) and two rows: row 0 is the
        accuracy, row 1 is the precision. Empty when ``model_dict`` is empty.
    """
    # Estimators expect a 1-D target; passing an (n, 1) column vector makes
    # scikit-learn emit a DataConversionWarning on every fit.
    y_fit = _flatten_single_column(y_train)
    y_true = _flatten_single_column(y_test)

    # Built locally so repeated calls never share or mutate module-level state.
    scores = {}
    for name, model in model_dict.items():
        y_pred = model.fit(x_train, y_fit).predict(x_test)
        # scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric,
        # but swapping the arguments of precision_score yields recall instead.
        scores[name] = [
            metrics.accuracy_score(y_true, y_pred),
            metrics.precision_score(y_true, y_pred),
        ]
    return pd.DataFrame(scores)
# Evaluate all models, transpose so that each model becomes one row, label the
# two metric columns, and display the table.
results = pd.DataFrame(
    model_est(model_dict, x_train, x_test, y_train, y_test)
).transpose()
results.columns = ['accuracy_score', 'precision_score']
print(results)
accuracy_score precision_score
逻辑回归 0.787666 0.314763
SVM 0.761738 0.058496
决策树 0.695865 0.459610
随机森林 0.758234 0.236769
XGBoost 0.786265 0.367688