import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets
from sklearn import cross_validation
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import f1_score,precision_score,recall_score,accuracy_score,classification_report
# Load the iris dataset:
def load_data():
    """Load the iris dataset and return a stratified train/test split.

    Returns:
        (X_train, X_test, y_train, y_test): a 75/25 split of the iris data
        (150 samples, 4 features, 3 classes); `stratify` keeps the class
        proportions equal in both parts, `random_state=0` makes it repeatable.
    """
    # `sklearn.cross_validation` was deprecated in 0.18 and removed in 0.20;
    # `train_test_split` now lives in `sklearn.model_selection`.
    from sklearn.model_selection import train_test_split
    iris = datasets.load_iris()
    X_train = iris.data
    y_train = iris.target
    return train_test_split(X_train, y_train, test_size=0.25,
                            random_state=0, stratify=y_train)
def test_DecisionTreeClassifier(*data):
    """Fit a default DecisionTreeClassifier and print its scores.

    :param data: variadic arguments; a tuple of
        (X_train, X_test, y_train, y_test)
    """
    X_train, X_test, y_train, y_test = data
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    train_accuracy = model.score(X_train, y_train)
    test_accuracy = model.score(X_test, y_test)
    print('training:%f' % train_accuracy)
    print('test:%f' % test_accuracy)
    print(classification_report(y_test, predictions))
# Exercise the code above:
X_train,X_test,y_train,y_test=load_data()# produce the dataset for the classification problem
test_DecisionTreeClassifier(X_train,X_test,y_train,y_test)
def test_DecisionTreeClassifier_criterion(*data):
    """
    Test how the `criterion` parameter affects DecisionTreeClassifier.

    The criterion measures split quality; supported values are 'gini'
    (Gini impurity) and 'entropy' (information gain).
    :param data: variadic arguments; a tuple of
        (X_train, X_test, y_train, y_test)
    :return 0
    """
    X_train, X_test, y_train, y_test = data
    for split_criterion in ('gini', 'entropy'):
        tree = DecisionTreeClassifier(criterion=split_criterion)
        tree.fit(X_train, y_train)
        predicted = tree.predict(X_test)
        print(classification_report(y_test, predicted))
        # NOTE: 'critertion' typo kept — it is the original runtime output.
        print('critertion:%s' % split_criterion)
        print('training:%f' % tree.score(X_train, y_train))
        print('test:%f' % tree.score(X_test, y_test))
# Compare 'gini' vs 'entropy' on the iris split.
test_DecisionTreeClassifier_criterion( X_train,X_test,y_train,y_test)
def test_DecisionTreeClassifier_splitter(*data):
    """
    Test how DecisionTreeClassifier's performance varies with `splitter`.

    splitter selects the split strategy at each node: 'best' chooses the
    best split, 'random' chooses the best random split.
    :param data: variadic arguments; a tuple of
        (X_train, X_test, y_train, y_test)
    :return 0
    """
    X_train, X_test, y_train, y_test = data
    for strategy in ('best', 'random'):
        tree = DecisionTreeClassifier(splitter=strategy)
        tree.fit(X_train, y_train)
        print('splitter:%s' % strategy)
        predicted = tree.predict(X_test)
        print(classification_report(y_test, predicted))
        print('training:%f' % tree.score(X_train, y_train))
        print('test:%f' % tree.score(X_test, y_test))
# Compare 'best' vs 'random' split strategies on the iris split.
test_DecisionTreeClassifier_splitter( X_train,X_test,y_train,y_test)
def test_DecisionTreeClassifier_depth(*data, maxdepth):
    """
    Test how DecisionTreeClassifier's performance varies with `max_depth`.

    max_depth is the maximum tree depth; when None, nodes expand until all
    leaves are pure or contain fewer than the minimum sample count.
    :param data: variadic arguments; a tuple of
        (X_train, X_test, y_train, y_test)
    :param maxdepth: int; depths 1 .. maxdepth-1 are evaluated
        (np.arange excludes the upper bound)
    :return 0
    """
    X_train, X_test, y_train, y_test = data
    depths = np.arange(1, maxdepth)
    training_scores = []
    testing_scores = []
    for d in depths:
        tree = DecisionTreeClassifier(max_depth=d).fit(X_train, y_train)
        training_scores.append(tree.score(X_train, y_train))
        testing_scores.append(tree.score(X_test, y_test))
    # Plot train vs. test accuracy as a function of depth.
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(depths, training_scores, label='training score', marker='o')
    ax.plot(depths, testing_scores, label='testing score', marker='*')
    ax.set_xlabel('maxdepth')
    ax.set_ylabel('score')
    ax.set_title('Decision Tree Classification')
    ax.legend(framealpha=0.5, loc='best')
    plt.show()
# Sweep tree depths 1..9 and plot train/test accuracy.
test_DecisionTreeClassifier_depth(X_train,X_test,y_train,y_test,maxdepth=10)