特征二值化
请参考《数据准备和特征工程》中的相关章节,调试如下代码。
基础知识
import pandas as pd
# Load pm2.csv into a DataFrame and preview its first rows.
pm25 = pd.read_csv("/home/aistudio/data/data20505/pm2.csv")
pm25.head()
|
RANK |
CITY_ID |
CITY_NAME |
Exposed days |
0 |
1 |
594 |
拉萨 |
2 |
1 |
2 |
579 |
玉溪 |
7 |
2 |
3 |
263 |
厦门 |
8 |
3 |
4 |
267 |
泉州 |
9 |
4 |
5 |
271 |
漳州 |
10 |
import numpy as np
# Binarize the "Exposed days" feature using its mean as the threshold:
# values strictly above the mean become 1, everything else becomes 0.
pm25['bdays'] = np.where(pm25["Exposed days"] > pm25["Exposed days"].mean(), 1, 0)
# Inspect 10 randomly sampled rows.
pm25.sample(10)
|
RANK |
CITY_ID |
CITY_NAME |
Exposed days |
bdays |
174 |
197 |
598 |
宝鸡 |
128 |
1 |
260 |
283 |
364 |
安阳 |
246 |
1 |
137 |
152 |
183 |
南通 |
103 |
0 |
249 |
272 |
358 |
开封 |
210 |
1 |
54 |
62 |
44 |
朔州 |
58 |
0 |
217 |
240 |
315 |
东营 |
170 |
1 |
86 |
96 |
86 |
抚顺 |
78 |
0 |
77 |
86 |
303 |
上饶 |
71 |
0 |
190 |
213 |
608 |
商洛 |
138 |
1 |
180 |
203 |
438 |
岳阳 |
132 |
1 |
from sklearn.preprocessing import Binarizer
# Create a Binarizer whose threshold is the mean of "Exposed days".
bn = Binarizer(threshold=pm25["Exposed days"].mean())
# Single brackets (pm25["Exposed days"]) select a Series, as the print below
# shows; double brackets (pm25[["Exposed days"]]) select a one-column
# DataFrame, which is the 2-D input handed to fit_transform.
print(type(pm25["Exposed days"]))
# Fit the binarizer and transform the feature in a single step.
result = bn.fit_transform(pm25[["Exposed days"]])
pm25['sk-bdays'] = result
pm25.sample(10)
|
RANK |
CITY_ID |
CITY_NAME |
Exposed days |
bdays |
sk-bdays |
92 |
103 |
147 |
双鸭山 |
82 |
0 |
0 |
12 |
13 |
273 |
南平 |
18 |
0 |
0 |
175 |
198 |
545 |
遂宁 |
129 |
1 |
1 |
157 |
179 |
449 |
怀化 |
119 |
1 |
1 |
67 |
75 |
507 |
钦州 |
65 |
0 |
0 |
40 |
45 |
88 |
丹东 |
49 |
0 |
0 |
41 |
46 |
462 |
珠海 |
49 |
0 |
0 |
121 |
133 |
230 |
衢州 |
94 |
0 |
0 |
70 |
78 |
510 |
玉林 |
67 |
0 |
0 |
176 |
199 |
199 |
泰州 |
130 |
1 |
1 |
# Double brackets select a one-column DataFrame, so the shape is 2-D.
pm25[["Exposed days"]].shape
(264, 1)
# Single brackets select a Series, so the shape is 1-D.
pm25["Exposed days"].shape
(264,)
# reshape((-1, 1)) turns the 1-D values into a single column; -1 lets NumPy
# infer the number of rows.
pm25["Exposed days"].values.reshape((-1, 1)).shape
(264, 1)
from sklearn.preprocessing import binarize
# binarize() is a plain function, not a fitted model: it returns the
# binarized values directly, so fbin is the resulting array and is indexed
# by row position below.
fbin = binarize(pm25[['Exposed days']], threshold=pm25['Exposed days'].mean())
fbin[[1, 50, 100, 150, 200]]
array([[0],
[0],
[0],
[1],
[1]])
# scale (float): standard deviation of the normal distribution; the larger it
# is, the flatter and wider the curve.
gau = np.random.normal(loc=0, scale=1.0, size=100)
gau
array([-0.16138569, -1.12381876, 1.08345071, -0.3374515 , -0.4377176 ,
-0.18485122, -0.47717794, -0.15147513, 1.34975203, -0.06388386,
1.2794776 , -0.67413457, -2.03388881, 1.77891998, 3.45445178,
-0.93258988, 0.39723041, -1.23677885, 1.87841988, -0.48846415,
0.23898558, 0.08322678, 0.50841094, -0.59189042, -0.86218771,
0.13808454, -1.420791 , -0.52815037, 0.37716549, 0.55944191,
-0.81171679, 0.26489442, -0.62432789, -0.43654577, 0.226915 ,
-1.01346821, -1.42727242, -0.45127134, -0.18215018, 1.11537106,
0.30099939, -1.2661621 , -0.11895918, 1.69860201, -0.67702066,
-1.00599679, -0.69448062, -0.94056253, -0.25849202, -0.08299086,
1.48469908, -1.19206442, 0.95998195, -1.800488 , -0.04798554,
-0.77909029, 0.9659936 , -0.38096705, 1.52306246, -2.033555 ,
-1.82498521, -0.4745455 , -0.20866822, 2.01935722, 0.46819346,
0.37152816, -0.20247084, -1.14168624, 1.04413851, 0.98376221,
0.27129983, -0.66495964, -0.99604697, 0.31477433, 1.14606679,
0.92117707, 0.91663896, 0.96625631, -2.00554469, 1.02536304,
-0.63002324, 1.71252177, -0.65706596, -1.33159033, 0.08011075,
1.62804803, -1.63617324, 1.42729399, -2.14112983, 0.95559999,
-0.74515346, 1.29242505, 0.03208948, -0.45625835, 1.24445081,
-1.53939509, 0.40075234, -0.97061926, 0.39624106, -0.14267309])
# Binarizer() is used with its default threshold (0, per the original note).
# reshape(-1, 1) arranges the samples as a single column; -1 lets NumPy infer
# the number of rows.
gau_bin = Binarizer().fit_transform(gau.reshape(-1, 1))
# Flatten the column back to 1-D for display.
gau_bin.ravel()
array([0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 1., 0., 1.,
0., 1., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0.,
1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 1., 1., 1., 0., 0.,
1., 1., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 0., 1., 0., 0., 1.,
1., 0., 1., 0., 1., 0., 1., 1., 0., 1., 0., 1., 0., 1., 0.])
项目案例
%matplotlib inline
import matplotlib.pyplot as plt
import cv2
# Helper for displaying an OpenCV image inline in Jupyter.
def show_img(img):
    """Display an image with matplotlib, with the axes hidden.

    Args:
        img: Image array. A 3-dimensional array is treated as a BGR colour
            image and converted to RGB before plotting; any other array is
            drawn with the "gray" colour map.
    """
    if len(img.shape) == 3:
        # The channels arrive in B, G, R order; imshow expects R, G, B.
        plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    else:
        plt.imshow(img, cmap="gray")
    plt.axis("off")
    plt.show()
laoqi = cv2.imread("work/images/laoqi.png")
# cv2.imread returns None instead of raising when the file cannot be read,
# so fail here with a clear message rather than later inside show_img.
if laoqi is None:
    raise FileNotFoundError("cannot read image: work/images/laoqi.png")
show_img(laoqi)
# Convert BGR to grayscale.
gray_laoqi = cv2.cvtColor(laoqi, cv2.COLOR_BGR2GRAY)
show_img(gray_laoqi)
# cv2.threshold(img, threshold, maxval, type)
#   threshold: the cut-off value, 127 here
#   maxval:    the value written to pixels that pass the test, 255 here
#   type:      the binarization mode; with cv2.THRESH_BINARY pixels above the
#              threshold become maxval (255) and all others become 0
ret, thr = cv2.threshold(gray_laoqi, 127, 255, cv2.THRESH_BINARY)
show_img(thr)
动手练习
# Preview the raw marathon data before any conversion; the split and final
# columns are still shown as HH:MM:SS text at this point.
pd.read_csv("/home/aistudio/data/data20512/marathon.csv").head()
|
age |
gender |
split |
final |
0 |
33 |
M |
01:05:38 |
02:08:51 |
1 |
32 |
M |
01:06:26 |
02:09:28 |
2 |
31 |
M |
01:06:49 |
02:10:42 |
3 |
38 |
M |
01:06:16 |
02:13:45 |
4 |
31 |
M |
01:06:32 |
02:13:59 |
import datetime
# Convert a time string into datetime.timedelta; "split" is the
# half-marathon time.
def convert_time(s: str) -> datetime.timedelta:
    """Parse an "HH:MM:SS" string into a ``datetime.timedelta``.

    Args:
        s: Time text made of three integer fields separated by ":".

    Returns:
        The duration described by the hours, minutes and seconds fields.

    Raises:
        ValueError: If ``s`` does not contain exactly three integer fields.
    """
    # Use dedicated names so the parameter ``s`` is not rebound to the
    # seconds value.
    hours, minutes, seconds = (int(field) for field in s.split(":"))
    return datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds)
# Read the marathon data, parsing the "split" and "final" columns with
# convert_time while loading.
marathon = pd.read_csv(
    "/home/aistudio/data/data20512/marathon.csv",
    converters=dict.fromkeys(("split", "final"), convert_time),
)
print(marathon.dtypes)
marathon.head()
age int64
gender object
split timedelta64[ns]
final timedelta64[ns]
dtype: object
|
age |
gender |
split |
final |
0 |
33 |
M |
0 days 01:05:38 |
0 days 02:08:51 |
1 |
32 |
M |
0 days 01:06:26 |
0 days 02:09:28 |
2 |
31 |
M |
0 days 01:06:49 |
0 days 02:10:42 |
3 |
38 |
M |
0 days 01:06:16 |
0 days 02:13:45 |
4 |
31 |
M |
0 days 01:06:32 |
0 days 02:13:59 |
# Convert the timedelta columns to elapsed seconds (floats). The dtype guard
# keeps this cell safe to re-run: a column that has already been converted is
# left alone instead of being rescaled a second time.
for col in ("split", "final"):
    if pd.api.types.is_timedelta64_dtype(marathon[col]):
        marathon[col] = marathon[col].dt.total_seconds()
marathon.head()
|
age |
gender |
split |
final |
0 |
33 |
M |
3938.0 |
7731.0 |
1 |
32 |
M |
3986.0 |
7768.0 |
2 |
31 |
M |
4009.0 |
7842.0 |
3 |
38 |
M |
3976.0 |
8025.0 |
4 |
31 |
M |
3992.0 |
8039.0 |
# frac equals (final - 2 * split) / final: it is positive when the second
# half took longer than the first half (split) and negative when it was
# faster.
marathon['frac'] = 1 - 2 * marathon['split'] / marathon["final"]
marathon.head()
|
age |
gender |
split |
final |
frac |
0 |
33 |
M |
3938.0 |
7731.0 |
-0.018756 |
1 |
32 |
M |
3986.0 |
7768.0 |
-0.026262 |
2 |
31 |
M |
4009.0 |
7842.0 |
-0.022443 |
3 |
38 |
M |
3976.0 |
8025.0 |
0.009097 |
4 |
31 |
M |
3992.0 |
8039.0 |
0.006842 |
# Binarize frac: 0 where frac is positive, 1 otherwise.
marathon['split_frac'] = np.where(marathon['frac']>0, 0, 1)
# Inspect 10 randomly sampled rows.
marathon.sample(10)
|
age |
gender |
split |
final |
frac |
split_frac |
10199 |
43 |
M |
6524.0 |
14839.0 |
0.120695 |
0 |
9853 |
36 |
W |
6906.0 |
14724.0 |
0.061940 |
0 |
1385 |
39 |
M |
5304.0 |
11185.0 |
0.051587 |
0 |
2923 |
51 |
M |
5957.0 |
12162.0 |
0.020391 |
0 |
35513 |
42 |
M |
9818.0 |
24291.0 |
0.191635 |
0 |
2812 |
32 |
M |
5390.0 |
12090.0 |
0.108354 |
0 |
21482 |
60 |
M |
7353.0 |
17853.0 |
0.176273 |
0 |
31825 |
59 |
M |
8110.0 |
21184.0 |
0.234328 |
0 |
25189 |
38 |
W |
7816.0 |
18852.0 |
0.170804 |
0 |
15959 |
22 |
W |
7438.0 |
16395.0 |
0.092650 |
0 |