Summary of Key Code for Data Analysis

2023-05-16


# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
import missingno as msno  # alias fixed to match usage below
import pandas_profiling

# Initial settings
pd.set_option("display.max_columns",100)
# Read the data
data_init=pd.read_csv("训练数据.csv",encoding="gbk")
data_filer=data_init.copy()  # working copy used in the snippets below
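A quick structural check right after loading (a minimal sketch):

print(data_init.shape)
data_init.info()
data_init.head()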
# Preprocessing
# 1. Inspect the distinct values of a single column
data_filer["BufferCounter"].unique()
# 2. List all columns
list(data_filer.columns)
# Type conversion (both sides must list the same columns)
data_filer[['RamUsage', 'CpuUsage', 'VideoTotleTraffic']]=data_filer[['RamUsage', 'CpuUsage', 'VideoTotleTraffic']].apply(pd.to_numeric,errors="ignore")
# Drop rows with missing Latitude (dropna(subset=...) removes rows, not columns)
data_filer=data_filer.dropna(subset=["Latitude"])
# Missing-value handling
msno.matrix(data_filer)  # visualize the missingness pattern
RX=np.mean(data_filer["RX"])
data_filer["RX"]=data_filer["RX"].fillna(RX)  # fill missing RX with the column mean
# Correlations between attributes
data_filer.corr()
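A heatmap makes the correlation matrix easier to scan — a minimal sketch using the seaborn import above (numeric_only assumes pandas ≥ 1.5):

sns.heatmap(data_filer.corr(numeric_only=True), cmap="coolwarm")
plt.title("Feature correlations")
plt.show()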
# One-hot encoding
X_City=pd.get_dummies(X["City"])
X=pd.concat([X,X_City],axis=1)
X=X.drop(["City"],axis=1)
# Datetime handling
data_filer["VideoTestTime"]=pd.to_datetime(data_filer["VideoTestTime"])
X["year"]=X["VideoTestTime"].dt.year
X["month"]=X["VideoTestTime"].dt.month
X["day"]=X["VideoTestTime"].dt.day
X["hour"]=X["VideoTestTime"].dt.hour
X["minute"]=X["VideoTestTime"].dt.minute
X_data=X.drop(["VideoTestTime"],axis=1)
# Handling individual columns
# Sorting
data2=data_base[data_base["p_date"]==data_base["dateBefore"]].sort_values(by="enodebid")
import re
def f(x):
    """Extract the leading alphabetic token (e.g. the brand) from a phone-type string."""
    try:
        m=re.search(r"[a-zA-Z]+\s",x)
        if m:
            return m.group()
        m2=re.search(r"[a-zA-Z]+",x)
        if m2:
            return m2.group()
        return "other2"
    except TypeError:  # non-string values such as NaN
        return "other"
data_pho["PhoneTypenew"]=data_pho["PhoneType"].apply(f)
# Age column handling

# Fill missing Age with the per-title mean (also shows element-wise iteration over a Series)
Age_Pre=data[["Age","NameTitle"]].groupby("NameTitle")["Age"].mean()
type(Age_Pre)
for index,value in Age_Pre.items():
    data.loc[(data.Age.isnull())&(data.NameTitle==index),"Age"]=Age_Pre[index]
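The loop above can be replaced with a vectorized equivalent — a one-line sketch over the same columns:

data["Age"]=data["Age"].fillna(data.groupby("NameTitle")["Age"].transform("mean"))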
# Mapping Age
dataset.loc[ dataset['Age'] <= 16, 'Age']= 0
dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
dataset.loc[ dataset['Age'] > 64, 'Age'] = 4
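The same binning can be written with pd.cut — an equivalent sketch, assuming Age has no missing values left at this point:

dataset['Age'] = pd.cut(dataset['Age'], bins=[-np.inf, 16, 32, 48, 64, np.inf], labels=[0, 1, 2, 3, 4]).astype(int)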
sex={"male":0,"female":1}
dataset["Sex"]=dataset["Sex"].map(sex)


# Feature engineering

players['birth_date'] = pd.to_datetime(players.birthday, format='%d.%m.%Y')
players['age_years'] = ((pd.to_datetime("2013-01-01") - players['birth_date']).dt.days)/365.25
players['age_years']  

 



# Handle discrete categorical values: create higher-level categories

position_types = players.position.unique()
position_types
"""
array(['Center Back', 'Attacking Midfielder', 'Right Midfielder',
       'Center Midfielder', 'Goalkeeper', 'Defensive Midfielder',
       'Left Fullback', nan, 'Left Midfielder', 'Right Fullback',
       'Center Forward', 'Left Winger', 'Right Winger'], dtype=object)
"""

defense = ['Center Back','Defensive Midfielder', 'Left Fullback', 'Right Fullback', ]
midfield = ['Right Midfielder', 'Center Midfielder', 'Left Midfielder',]
forward = ['Attacking Midfielder', 'Left Winger', 'Right Winger', 'Center Forward']
keeper = 'Goalkeeper'

# Modify the dataframe: add the aggregated categorical column position_agg
players.loc[players['position'].isin(defense), 'position_agg'] = "Defense"
players.loc[players['position'].isin(midfield), 'position_agg'] = "Midfield"
players.loc[players['position'].isin(forward), 'position_agg'] = "Forward"
players.loc[players['position'].eq(keeper), 'position_agg'] = "Keeper"  

  

 


X=data_filer[['RamUsage', 'CpuUsage', 'Longitude', 'Latitude', 'City', 'Source',
       'NetType', 'APN/SSID', 'RX', 'L_SINR', 'LteRsrq', 'CI', 'VideoAvgSpeed',
       'VideoPeakSpeed', 'VideoTestTime',
       'VideoTotleTraffic']]
y=data_filer["BufferCounter"]
# Visualization
data_pho["PhoneType"].value_counts()[0:20].plot(kind="bar")

# Below: ensembling over multiple models
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.kernel_approximation import Nystroem
from sklearn.kernel_approximation import RBFSampler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score  # used by score_models below
from sklearn.model_selection import train_test_split  # used in the stacking section

Ensemble algorithms


SEED=666
def get_models():
    """Generate a library of base learners."""
    nb = GaussianNB()
    svc = SVC(C=100, probability=True)
    knn = KNeighborsClassifier(n_neighbors=3)
    lr = LogisticRegression(C=100, random_state=SEED)
    nn = MLPClassifier((80, 10), early_stopping=False, random_state=SEED)
    gb = GradientBoostingClassifier(n_estimators=100, random_state=SEED)
    rf = RandomForestClassifier(n_estimators=10, max_features=3, random_state=SEED)

    models = {'svm': svc,
              'knn': knn,
              'naive bayes': nb,
              'mlp-nn': nn,
              'random forest': rf,
              'gbm': gb,
              'logistic': lr,
              }

    return models
def train_predict(model_list):
    """Fit models in list on training set and return preds"""
    P = np.zeros((y_test.shape[0], len(model_list)))
    P = pd.DataFrame(P)

    print("Fitting models.")
    cols = list()
    for i, (name, m) in enumerate(model_list.items()):
        print("%s..." % name, end=" ", flush=False)
        m.fit(X_train, y_train)
        P.iloc[:, i] = m.predict_proba(X_test)[:, 1]
        cols.append(name)
        print("done")

    P.columns = cols
    print("Done.\n")
    return P
def score_models(P, y):
    """Score model in prediction DF"""
    print("Scoring models.")
    for m in P.columns:
        score = roc_auc_score(y, P.loc[:, m])
        print("%-26s: %.3f" % (m, score))
    print("Done.\n")  

# Fit and score each of the classifiers defined above
models = get_models()
P = train_predict(models)
score_models(P, y_test)  
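Averaging pays off most when the base learners' errors are uncorrelated, which is easy to check on the prediction frame P — a minimal sketch reusing y_test from above:

sns.heatmap(P.corr(), annot=True, cmap="coolwarm")
plt.title("Correlation of base-learner predictions")
plt.show()
print("Simple-average ensemble ROC-AUC: %.3f" % roc_auc_score(y_test, P.mean(axis=1)))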

# Plot ROC curves
from sklearn.metrics import roc_curve

def plot_roc_curve(ytest, P_base_learners, P_ensemble, labels, ens_label):
    """Plot the roc curve for base learners and ensemble."""
    plt.figure(figsize=(10, 8))
    plt.plot([0, 1], [0, 1], 'k--')
    
    cm = [plt.cm.rainbow(i)
          for i in np.linspace(0, 1.0, P_base_learners.shape[1] + 1)]
    
    for i in range(P_base_learners.shape[1]):
        p = P_base_learners[:, i]
        fpr, tpr, _ = roc_curve(ytest, p)
        plt.plot(fpr, tpr, label=labels[i], c=cm[i + 1])

    fpr, tpr, _ = roc_curve(ytest, P_ensemble)
    plt.plot(fpr, tpr, label=ens_label, c=cm[0])
        
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(frameon=False)
    plt.show()

plot_roc_curve(y_test, P.values, P.mean(axis=1), list(P.columns), "ensemble")

# Drop the weakest base learner and rescore
include = [c for c in P.columns if c not in ["mlp-nn"]]
print("Truncated ensemble ROC-AUC score: %.3f" % roc_auc_score(y_test, P.loc[:, include].mean(axis=1)))

# Visualize each model's predicted class shares (sort_index keeps the False/True order stable before relabeling)
p = P.apply(lambda x: (x >= 0.5).value_counts(normalize=True).sort_index())
p.index = ["DEM", "REP"]
p.loc["REP", :].sort_values().plot(kind="bar")
plt.axhline(0.25, color="k", linewidth=0.5)
plt.text(0., 0.23, "True share republicans")
plt.show()  

Stacking model

1. Define the base learners


base_learners = get_models()  

2. Define the meta learner (the second-layer model that weights the base learners)


meta_learner = GradientBoostingClassifier(
    n_estimators=1000,
    loss="exponential",
    max_features=4,
    max_depth=3,
    subsample=0.5,
    learning_rate=0.005, 
    random_state=SEED
)  

 

3. Split the training data in two, reserving one half for the second layer


xtrain_base, xpred_base, ytrain_base, ypred_base = train_test_split(
    xtrain, ytrain, test_size=0.5, random_state=SEED)  

 

4. Train the base learners


def train_base_learners(base_learners, inp, out, verbose=True):
    """Train all base learners in the library."""
    if verbose: print("Fitting models.")
    for i, (name, m) in enumerate(base_learners.items()):
        if verbose: print("%s..." % name, end=" ", flush=False)
        m.fit(inp, out)
        if verbose: print("done")  

 


train_base_learners(base_learners, xtrain_base, ytrain_base)  

 

5. Prepare the training data for the second-stage meta learner


def predict_base_learners(pred_base_learners, inp, verbose=True):
    """Generate a prediction matrix."""
    P = np.zeros((inp.shape[0], len(pred_base_learners)))

    if verbose: print("Generating base learner predictions.")
    for i, (name, m) in enumerate(pred_base_learners.items()):
        if verbose: print("%s..." % name, end=" ", flush=False)
        p = m.predict_proba(inp)
        # With two classes, need only predictions for one class
        P[:, i] = p[:, 1]
        if verbose: print("done")

    return P  

P_base = predict_base_learners(base_learners, xpred_base)  

 

6. Train the second stage to obtain the final classification


meta_learner.fit(P_base, ypred_base)  

def ensemble_predict(base_learners, meta_learner, inp, verbose=True):
    """Generate predictions from the ensemble."""
    P_pred = predict_base_learners(base_learners, inp, verbose=verbose)
    return P_pred, meta_learner.predict_proba(P_pred)[:, 1]  

P_pred, p = ensemble_predict(base_learners, meta_learner, xtest)
print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(ytest, p))  

 

The split above sacrifices part of the dataset, so we switch to cross-validated predictions below.


from sklearn.base import clone

def stacking(base_learners, meta_learner, X, y, generator):
    """Simple training routine for stacking."""

    # Train final base learners for test time
    print("Fitting final base learners...", end="")
    train_base_learners(base_learners, X, y, verbose=False)
    print("done")

    # Generate predictions for training meta learners
    # Outer loop:
    print("Generating cross-validated predictions...")
    cv_preds, cv_y = [], []
    for i, (train_idx, test_idx) in enumerate(generator.split(X)):

        fold_xtrain, fold_ytrain = X[train_idx, :], y[train_idx]
        fold_xtest, fold_ytest = X[test_idx, :], y[test_idx]

        # Inner loop: step 4 and 5
        fold_base_learners = {name: clone(model)
                              for name, model in base_learners.items()}
        train_base_learners(
            fold_base_learners, fold_xtrain, fold_ytrain, verbose=False)

        fold_P_base = predict_base_learners(
            fold_base_learners, fold_xtest, verbose=False)

        cv_preds.append(fold_P_base)
        cv_y.append(fold_ytest)
        print("Fold %i done" % (i + 1))

    print("CV-predictions done")
    
    # Be careful to get rows in the right order
    cv_preds = np.vstack(cv_preds)
    cv_y = np.hstack(cv_y)

    # Train meta learner
    print("Fitting meta learner...", end="")
    meta_learner.fit(cv_preds, cv_y)
    print("done")

    return base_learners, meta_learner  

from sklearn.model_selection import KFold

# Train with stacking
cv_base_learners, cv_meta_learner = stacking(
    get_models(), clone(meta_learner), xtrain.values, ytrain.values, KFold(2))

P_pred, p = ensemble_predict(cv_base_learners, cv_meta_learner, xtest, verbose=False)
print("\nEnsemble ROC-AUC score: %.3f" % roc_auc_score(ytest, p))  

 

A parallel implementation improves efficiency:


from mlens.ensemble import SuperLearner

# Instantiate the ensemble with 10 folds
sl = SuperLearner(
    folds=10,
    random_state=SEED,
    verbose=2,
    backend="multiprocessing"
)

# Add the base learners and the meta learner
sl.add(list(base_learners.values()), proba=True) 
sl.add_meta(meta_learner, proba=True)

# Train the ensemble
sl.fit(xtrain, ytrain)

# Predict the test set
p_sl = sl.predict_proba(xtest)

print("\nSuper Learner ROC-AUC score: %.3f" % roc_auc_score(ytest, p_sl[:, 1]))  

 

Reposted from: https://www.cnblogs.com/wangzhenghua/p/11240359.html
