Python员工离职数据分析
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Suppress library warnings so the cell output stays readable.
warnings.filterwarnings(action='ignore')
# Show every column when a DataFrame is printed (no column truncation).
pd.options.display.max_columns = None
# Default seaborn colour palette, indexed later to colour individual plots.
colors = sns.color_palette()
# Number of decimal places shown when floats are displayed.
# Use the fully qualified option key: the bare 'precision' pattern is
# rejected by newer pandas releases (it no longer resolves to one option).
pd.set_option('display.precision', 3)
# Render Chinese text with the SimHei font and disable the Unicode minus
# glyph so that labels and tick values display correctly with that font.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
项目名称:IBM员工离职数据分析
数据来源:https://tianchi.aliyun.com/dataset/dataDetail?dataId=77180
# Path of the attrition CSV, relative to the current working directory.
data_path = './data/WA_Fn-UseC_-HR-Employee-Attrition.csv'
# Read the CSV into a DataFrame, then print its column / dtype summary.
df = pd.read_csv(filepath_or_buffer=data_path)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
Age 1470 non-null int64
Attrition 1470 non-null object
BusinessTravel 1470 non-null object
DailyRate 1470 non-null int64
Department 1470 non-null object
DistanceFromHome 1470 non-null int64
Education 1470 non-null int64
EducationField 1470 non-null object
EmployeeCount 1470 non-null int64
EmployeeNumber 1470 non-null int64
EnvironmentSatisfaction 1470 non-null int64
Gender 1470 non-null object
HourlyRate 1470 non-null int64
JobInvolvement 1470 non-null int64
JobLevel 1470 non-null int64
JobRole 1470 non-null object
JobSatisfaction 1470 non-null int64
MaritalStatus 1470 non-null object
MonthlyIncome 1470 non-null int64
MonthlyRate 1470 non-null int64
NumCompaniesWorked 1470 non-null int64
Over18 1470 non-null object
OverTime 1470 non-null object
PercentSalaryHike 1470 non-null int64
PerformanceRating 1470 non-null int64
RelationshipSatisfaction 1470 non-null int64
StandardHours 1470 non-null int64
StockOptionLevel 1470 non-null int64
TotalWorkingYears 1470 non-null int64
TrainingTimesLastYear 1470 non-null int64
WorkLifeBalance 1470 non-null int64
YearsAtCompany 1470 non-null int64
YearsInCurrentRole 1470 non-null int64
YearsSinceLastPromotion 1470 non-null int64
YearsWithCurrManager 1470 non-null int64
dtypes: int64(26), object(9)
memory usage: 402.1+ KB
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()
|
Age |
DailyRate |
DistanceFromHome |
Education |
EmployeeCount |
EmployeeNumber |
EnvironmentSatisfaction |
HourlyRate |
JobInvolvement |
JobLevel |
JobSatisfaction |
MonthlyIncome |
MonthlyRate |
NumCompaniesWorked |
PercentSalaryHike |
PerformanceRating |
RelationshipSatisfaction |
StandardHours |
StockOptionLevel |
TotalWorkingYears |
TrainingTimesLastYear |
WorkLifeBalance |
YearsAtCompany |
YearsInCurrentRole |
YearsSinceLastPromotion |
YearsWithCurrManager |
count |
1470.000 |
1470.000 |
1470.000 |
1470.000 |
1470.0 |
1470.000 |
1470.000 |
1470.000 |
1470.000 |
1470.000 |
1470.000 |
1470.000 |
1470.000 |
1470.000 |
1470.00 |
1470.000 |
1470.000 |
1470.0 |
1470.000 |
1470.000 |
1470.000 |
1470.000 |
1470.000 |
1470.000 |
1470.000 |
1470.000 |
mean |
36.924 |
802.486 |
9.193 |
2.913 |
1.0 |
1024.865 |
2.722 |
65.891 |
2.730 |
2.064 |
2.729 |
6502.931 |
14313.103 |
2.693 |
15.21 |
3.154 |
2.712 |
80.0 |
0.794 |
11.280 |
2.799 |
2.761 |
7.008 |
4.229 |
2.188 |
4.123 |
std |
9.135 |
403.509 |
8.107 |
1.024 |
0.0 |
602.024 |
1.093 |
20.329 |
0.712 |
1.107 |
1.103 |
4707.957 |
7117.786 |
2.498 |
3.66 |
0.361 |
1.081 |
0.0 |
0.852 |
7.781 |
1.289 |
0.706 |
6.127 |
3.623 |
3.222 |
3.568 |
min |
18.000 |
102.000 |
1.000 |
1.000 |
1.0 |
1.000 |
1.000 |
30.000 |
1.000 |
1.000 |
1.000 |
1009.000 |
2094.000 |
0.000 |
11.00 |
3.000 |
1.000 |
80.0 |
0.000 |
0.000 |
0.000 |
1.000 |
0.000 |
0.000 |
0.000 |
0.000 |
25% |
30.000 |
465.000 |
2.000 |
2.000 |
1.0 |
491.250 |
2.000 |
48.000 |
2.000 |
1.000 |
2.000 |
2911.000 |
8047.000 |
1.000 |
12.00 |
3.000 |
2.000 |
80.0 |
0.000 |
6.000 |
2.000 |
2.000 |
3.000 |
2.000 |
0.000 |
2.000 |
50% |
36.000 |
802.000 |
7.000 |
3.000 |
1.0 |
1020.500 |
3.000 |
66.000 |
3.000 |
2.000 |
3.000 |
4919.000 |
14235.500 |
2.000 |
14.00 |
3.000 |
3.000 |
80.0 |
1.000 |
10.000 |
3.000 |
3.000 |
5.000 |
3.000 |
1.000 |
3.000 |
75% |
43.000 |
1157.000 |
14.000 |
4.000 |
1.0 |
1555.750 |
4.000 |
83.750 |
3.000 |
3.000 |
4.000 |
8379.000 |
20461.500 |
4.000 |
18.00 |
3.000 |
4.000 |
80.0 |
1.000 |
15.000 |
3.000 |
3.000 |
9.000 |
7.000 |
3.000 |
7.000 |
max |
60.000 |
1499.000 |
29.000 |
5.000 |
1.0 |
2068.000 |
4.000 |
100.000 |
4.000 |
5.000 |
4.000 |
19999.000 |
26999.000 |
9.000 |
25.00 |
4.000 |
4.000 |
80.0 |
3.000 |
40.000 |
6.000 |
4.000 |
40.000 |
18.000 |
15.000 |
17.000 |
# Histogram of every column, laid out on one large grid figure.
colnm = df.columns.to_list()
plt.figure(figsize=(35, 25))
# Derive the grid from the actual number of columns instead of hard-coding
# 35 subplots; with 35 columns this is still the 5-row x 7-column grid.
n_grid_cols = 7
n_grid_rows = -(-len(colnm) // n_grid_cols)  # ceiling division
for i, col in enumerate(colnm):
    # Subplot positions are 1-based, hence i + 1.
    plt.subplot(n_grid_rows, n_grid_cols, i + 1)
    # bins sets how many bars each histogram is divided into.
    df[col].hist(bins=80, color=colors[1])
    plt.xlabel(col, fontsize=13)
    plt.ylabel('Frequency')
# Adjust spacing once, after all subplots have been drawn.
plt.tight_layout()
print('\n figure 01')
figure 01
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210318173633587.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2JyeXRsZXZzb24=,size_16,color_FFFFFF,t_70#pic_center)
得出结论:该公司员工平均年龄约为37岁,平均工作年限约为11年,最长工作年限为40年,月平均收入约为6502.93。
# Attrition by department: grouped bar chart of head-counts per
# (Department, Attrition) pair.
df1 = pd.crosstab(df['Department'], df['Attrition'])
# DataFrame.plot opens its own figure, so the size must be passed here.
# A separate plt.figure(figsize=...) call beforehand would only leave an
# empty figure behind and the requested size would not be applied.
df1.plot(kind='bar', figsize=(15, 9))
plt.legend()
plt.xlabel('部门')
plt.ylabel('人数')
# Keep the department names horizontal.
plt.xticks(rotation=0)
plt.title('公司各部门离职率分析')
print('\n figure 02')
figure 02
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210318173715933.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2JyeXRsZXZzb24=,size_16,color_FFFFFF,t_70#pic_center)
得出结论:公司主要由三个部门组成(人力资源,科技部门,销售部门),其中科技部门的总人数最多,相对应的离职人数也是最多的,人力资源部门总人数和离职人数都最少
# Pie chart: share of employees who left versus those who stayed.
attrition_flags = df['Attrition']
labels = ['离职', '在职']
# One count per label, in label order ('Yes' = left, 'No' = stayed).
sizes = [sum(attrition_flags == flag) for flag in ('Yes', 'No')]
explode = (0, 0.001)
plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    autopct='%1.1f%%',
    shadow=False,
    startangle=150,
)
plt.title("公司离职人数占比分析")
print('\n figure 03')
figure 03
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210318173739682.png#pic_center)
得出结论:公司离职率是16%
# Overtime versus attrition: head-count per OverTime category, split by
# the Attrition value.
plt.figure(figsize=(6, 4))
overtime_ax = sns.countplot(data=df, x='OverTime', hue='Attrition', color=colors[4])
overtime_ax.legend()
overtime_ax.set_xlabel('加班时长')
overtime_ax.set_title('加班与离职率的关系')
print('\n figure 04')
figure 04
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210318173752643.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2JyeXRsZXZzb24=,size_16,color_FFFFFF,t_70#pic_center)
得出结论:加班员工的离职比例明显高于不加班的员工,加班是导致离职的重要原因
# Business travel versus attrition: head-count per BusinessTravel category,
# split by the Attrition value.
plt.figure(figsize=(6, 4))
travel_ax = sns.countplot(data=df, x='BusinessTravel', hue='Attrition')
travel_ax.legend()
travel_ax.set_xlabel('商务出差频次')
travel_ax.set_title('商务出差与离职率的关系')
print('\n figure 05')
figure 05
![在这里插入图片描述](https://img-blog.csdnimg.cn/2021031817381075.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2JyeXRsZXZzb24=,size_16,color_FFFFFF,t_70#pic_center)
得出结论:如果频繁的商务出差,员工的离职率会更高
# Education level versus attrition: head-count per Education value,
# split by the Attrition value.
plt.figure(figsize=(6, 4))
education_ax = sns.countplot(data=df, x='Education', hue='Attrition', color=colors[6])
education_ax.legend()
education_ax.set_xlabel('教育程度')
education_ax.set_title('教育程度与离职率的关系')
print('\n figure 06')
figure 06
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210318173823654.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2JyeXRsZXZzb24=,size_16,color_FFFFFF,t_70#pic_center)
得出结论:科技公司大部分学历中偏上,教育程度越高,更有竞争力,离职率也越低
# Gender versus attrition: head-count per Gender value, split by the
# Attrition value.
plt.figure(figsize=(6, 4))
gender_ax = sns.countplot(data=df, x='Gender', hue='Attrition', color=colors[8])
gender_ax.legend()
gender_ax.set_xlabel('性别')
gender_ax.set_title('性别与离职率的关系')
print('\n figure 07')
figure 07
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210318173836331.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2JyeXRsZXZzb24=,size_16,color_FFFFFF,t_70#pic_center)
得出结论:科技公司男性人数比较多,通过公司人口基数来对比,女性的离职率是要高于男性的
# Marital status versus attrition: pie chart of the employees who left,
# broken down by marital status.
# Only rows with Attrition == 'Yes' are counted, so the chart matches its
# title; counting every employee would show the workforce composition.
df_left = df[df['Attrition'] == 'Yes']
labels = ['单身', '已婚', '离婚']
# The counts must follow the label order (Single, Married, Divorced).
# Listing Married first would swap the '单身' and '已婚' slices.
sizes = [
    sum(df_left['MaritalStatus'] == status)
    for status in ('Single', 'Married', 'Divorced')
]
explode = (0.04, 0, 0.001)
plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', shadow=False, startangle=150)
plt.title("婚姻状况与离职率的关系")
print('\n figure 08')
figure 08
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210318173848883.png#pic_center)
得出结论:公司单身离职率最高,离婚后的离职率最低
# Monthly income versus attrition: pie chart of the employees who left,
# split into three income bands.
# The bands are contiguous: incomes of exactly 8000 or 15000 fall into the
# middle band instead of being dropped by strict inequalities on both sides.
df_income1 = df[df['MonthlyIncome'] < 8000]
df_income2 = df[(df['MonthlyIncome'] >= 8000) & (df['MonthlyIncome'] <= 15000)]
df_income3 = df[df['MonthlyIncome'] > 15000]
labels = ['月收入小于8000离职率', '月收入8000-15000离职率', '月收入大于15000离职率']
# Count leavers inside each band. Using the full DataFrame for the second
# and third slices would make them identical and unrelated to income.
sizes = [
    sum(band['Attrition'] == 'Yes')
    for band in (df_income1, df_income2, df_income3)
]
explode = (0.04, 0, 0.001)
plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', shadow=False, startangle=150)
plt.title("收入与离职率的关系")
print('\n figure 09')
figure 09
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210318173904998.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2JyeXRsZXZzb24=,size_16,color_FFFFFF,t_70#pic_center)
得出结论:月收入低于8000的员工离职人数最多,月收入越高,离职人数越少
# Number of previous employers versus attrition: line plot of leaver
# counts for three ranges of NumCompaniesWorked.
# The ranges are half-open and contiguous ([0, 3), [3, 6), [6, 10)), so
# employees with exactly 0, 3 or 6 previous companies are counted.
# Strict inequalities on both ends would silently exclude them.
df_worked1 = df[(df['NumCompaniesWorked'] >= 0) & (df['NumCompaniesWorked'] < 3)]
df_worked2 = df[(df['NumCompaniesWorked'] >= 3) & (df['NumCompaniesWorked'] < 6)]
df_worked3 = df[(df['NumCompaniesWorked'] >= 6) & (df['NumCompaniesWorked'] < 10)]
x = ['0-3家', '3-6家', '6-10家']
# Leavers per range, in the same order as the x labels.
y = [
    sum(group['Attrition'] == 'Yes')
    for group in (df_worked1, df_worked2, df_worked3)
]
plt.plot(x, y, color='r')
plt.xlabel('员工工作过公司数量')
plt.ylabel('离职人数')
plt.title("员工工作过的公司与离职率的关系")
print('\n figure 10')
figure 10
![在这里插入图片描述](https://img-blog.csdnimg.cn/20210412100120172.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L2JyeXRsZXZzb24=,size_16,color_FFFFFF,t_70#pic_center)
得出结论:员工工作过的公司越少,离职率越低,说明频繁跳槽的员工稳定性较差,更容易离职。
总结:
1.加班是导致离职最重要的原因。
建议:完善加班制度,可以对员工每月加班的时间进行限制。
加强员工培训,合理安排工作时间内的工作和会议,提高工作效率。
2.差旅次数过多也会导致员工离职。因为员工长期出差没办法兼顾家庭并且工作负担也比较大。
建议:合理安排员工的差旅次数。例如实施轮流差旅措施
3.工作年数也和离职率密切相关。工作年数越长,职位水平越高,离职率越低。
建议:公司招聘时考察员工的稳定性,一般情况下,以前待过的公司越少越好。
4.单身的人离职率较高。
建议:公司多关照单身的人,别让加班耽误了单身青年谈恋爱,相亲的时间。