哈尔滨理工大学

软件与微电子学院

实 验 报 告

(2020-2021第二学期)

课程名称:数据分析方法
班 级:软件18-1班
学 号:1814010130
姓 名:张立辉

哈尔滨理工大学软件与微电子学院


实验名称:实验四 分类分析    专 业:软件工程
姓 名:张立辉    学 号:1814010130    班 级:软件18-1

一、实验目的:

通过实验,体会分类过程,深刻理解分类思想;
熟悉和掌握决策树的分类原理、实质和过程,掌握典型的决策树算法和实现技术。
熟悉和掌握贝叶斯分类原理、实质和过程,掌握贝叶斯分类算法和实现技术。

二、实验内容:

根据play或Car Evaluation数据集,使用ID3算法设计创建决策树,并测试决策树的效用。
根据Car Evaluation数据集,使用贝叶斯分类对数据集进行分析。

三、实验设备及软件环境:

Windows10
Python3.8
PyCharm 2020.3.5 (Professional Edition)

四、实验过程及结果:

根据play或Car Evaluation数据集,使用ID3算法设计创建决策树,并测试决策树的效用。

ID3Tree.py

# -*- coding: utf-8 -*-
import operator
from math import log
import pandas as pd
import numpy as np


def majorityCnt(classList):
    """Return the class label that occurs most often in classList.

    Used as the leaf label when no features remain but the instances
    are still impure.

    :param classList: non-empty list of class labels
    :return: the most frequent label (ties broken by first-seen order)

    BUGFIX: in the original, the increment, the sort, and the result were
    all nested inside the `if vote not in classCount` branch, so every
    label was counted at most once and repeats were never accumulated.
    """
    classCount = {}
    for vote in classList:  # tally occurrences of each label
        classCount[vote] = classCount.get(vote, 0) + 1
    # sort (label, count) pairs by count, descending
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


##创建数据集
def createDataSet(path=r"C:\Users\zlh\Desktop\数据分析方法实验\实验三、四\play.xlsx",
                  featureName=None):
    """Load a dataset from an Excel file.

    Generalized: the file path and feature names are now parameters, so
    the Car Evaluation dataset can be loaded without editing this code.
    Defaults preserve the original behavior (play.xlsx).

    :param path: Excel file to read; each row is one instance, the last
        column is the class label
    :param featureName: names of the feature columns; defaults to the
        play.xlsx columns. For Car Evaluation.xlsx pass
        ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety'].
    :return: (dataSet, featureName) where dataSet is a list of row lists
    """
    data = pd.read_excel(path)
    train_data = np.array(data)  # DataFrame -> np.ndarray
    dataSet = train_data.tolist()  # ndarray -> list of lists

    if featureName is None:
        # NOTE(review): 'teperature' is a typo for 'temperature'; kept as-is
        # because the name is only a display label in the plotted tree
        featureName = ['outlook', 'teperature', 'humidity', 'windy']
    return dataSet, featureName


##分割数据集
def splitDataSet(dataSet, axis, value):
    """
    按照给定特征划分数据集
    :param axis:划分数据集的特征的维度
    :param value:特征的值
    :return: 符合该特征的所有实例(并且自动移除掉这维特征)
    """

    # 循环遍历dataSet中的每一行数据
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reduceFeatVec = featVec[:axis]  # 删除这一维特征
            reduceFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reduceFeatVec)
    return retDataSet


##计算信息熵
# 计算的始终是类别标签的不确定度
def calcShannonEnt(dataSet):
    """Compute the Shannon entropy of the class label (last column).

    :param dataSet: list of instances; the label is each row's last element
    :return: H(Y) in bits
    """
    total = len(dataSet)
    # frequency of each distinct label
    labelFreq = {}
    for row in dataSet:
        label = row[-1]
        labelFreq[label] = labelFreq.get(label, 0) + 1
    # H(Y) = -sum p * log2(p)
    return -sum((count / total) * log(count / total, 2)
                for count in labelFreq.values())


## 计算条件熵
def calcConditionalEntropy(dataSet, i, featList, uniqueVals):
    """
    计算x_i给定的条件下,Y的条件熵
    :param dataSet: 数据集
    :param i: 维度i
    :param featList: 数据集特征列表
    :param unqiueVals: 数据集特征集合
    :return: 条件熵
    """
    ce = 0.0
    for value in uniqueVals:
        subDataSet = splitDataSet(dataSet, i, value)
        prob = len(subDataSet) / float(len(dataSet))  # 极大似然估计概率
        ce += prob * calcShannonEnt(subDataSet)  # ∑pH(Y|X=xi) 条件熵的计算
    return ce


##计算信息增益
def calcInformationGain(dataSet, baseEntropy, i):
    """
    计算信息增益
    :param dataSet: 数据集
    :param baseEntropy: 数据集中Y的信息熵
    :param i: 特征维度i
    :return: 特征i对数据集的信息增益g(dataSet | X_i)
    """
    featList = [example[i] for example in dataSet]  # 第i维特征列表
    uniqueVals = set(featList)  # 换成集合 - 集合中的每个元素不重复
    newEntropy = calcConditionalEntropy(dataSet, i, featList, uniqueVals)  # 计算条件熵,
    infoGain = baseEntropy - newEntropy  # 信息增益 = 信息熵 - 条件熵
    return infoGain


## 算法框架
def chooseBestFeatureToSplitByID3(dataSet):
    """
    选择最好的数据集划分
    :param dataSet:
    :return:
    """
    numFeatures = len(dataSet[0]) - 1  # 最后一列是分类
    baseEntropy = calcShannonEnt(dataSet)  # 返回整个数据集的信息熵
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):  # 遍历所有维度特征
        infoGain = calcInformationGain(dataSet, baseEntropy, i)  # 返回具体特征的信息增益
        if (infoGain > bestInfoGain):
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature  # 返回最佳特征对应的维度


def createTree(dataSet, featureName, chooseBestFeatureToSplitFunc=chooseBestFeatureToSplitByID3):
    """Recursively build an ID3 decision tree.

    :param dataSet: list of instances; the last column is the class label
    :param featureName: name of each feature column (no longer mutated)
    :param chooseBestFeatureToSplitFunc: splitting criterion
    :return: nested dict {featureName: {value: subtree-or-label}}, or a
        bare class label for a leaf

    BUGFIX: the original executed `del featureName[bestFeat]`, mutating the
    caller's list as a side effect; we now build a reduced copy instead.
    """
    classList = [example[-1] for example in dataSet]  # labels of all instances
    if classList.count(classList[0]) == len(classList):
        return classList[0]  # pure node: stop and return the shared label
    if len(dataSet[0]) == 1:
        # only the label column remains: majority vote decides the leaf
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplitFunc(dataSet)  # index of best feature
    bestFeatLabel = featureName[bestFeat]
    # names left after removing the chosen feature (copy; caller's list untouched)
    remainingNames = featureName[:bestFeat] + featureName[bestFeat + 1:]
    myTree = {bestFeatLabel: {}}
    # one branch per distinct value of the chosen feature
    featValues = [example[bestFeat] for example in dataSet]
    for value in set(featValues):
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), remainingNames[:])
    return myTree


# Demo: build and print the decision tree.
# Guarded so that `from ID3Tree import *` (as done in main.py) does not
# re-read the Excel file and re-print the tree on import.
if __name__ == "__main__":
    dataSet, featureName = createDataSet()
    myTree = createTree(dataSet, featureName)
    print(myTree)

treePlotter.py

# -*- coding: utf-8 -*-

import matplotlib.pyplot as plt

# 定义文本框和箭头格式
decisionNode = dict(boxstyle="round4", color='#3366FF')  # 定义判断结点形态
leafNode = dict(boxstyle="circle", color='#FF6633')  # 定义叶结点形态
arrow_args = dict(arrowstyle="<-", color='g')  # 定义箭头


# 绘制带箭头的注释
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Draw one annotated node with an arrow from parentPt to centerPt.

    Relies on createPlot having stored the active axes in createPlot.ax1.
    """
    axis = createPlot.ax1
    axis.annotate(
        nodeTxt,
        xy=parentPt,
        xycoords='axes fraction',
        xytext=centerPt,
        textcoords='axes fraction',
        va="center",
        ha="center",
        bbox=nodeType,
        arrowprops=arrow_args,
    )


# 计算叶结点数
# 计算叶结点数
def getNumLeafs(myTree):
    """Count the leaves of a decision tree stored as nested dicts.

    :param myTree: {feature: {value: subtree-or-label}}
    :return: number of leaf (label) nodes
    """
    rootFeature = list(myTree.keys())[0]
    leafCount = 0
    for branch in myTree[rootFeature].values():
        # a dict value is a subtree; anything else is a leaf label
        leafCount += getNumLeafs(branch) if isinstance(branch, dict) else 1
    return leafCount


# 计算树的层数
# 计算树的层数
def getTreeDepth(myTree):
    """Return the number of decision levels in the tree.

    :param myTree: {feature: {value: subtree-or-label}}
    :return: depth counted in decision nodes along the longest path
    """
    rootFeature = list(myTree.keys())[0]
    deepest = 0
    for branch in myTree[rootFeature].values():
        # each branch contributes 1 level, plus its own depth if it is a subtree
        branchDepth = 1 + getTreeDepth(branch) if isinstance(branch, dict) else 1
        deepest = max(deepest, branchDepth)
    return deepest


# 在父子结点间填充文本信息
# 在父子结点间填充文本信息
def plotMidText(cntrPt, parentPt, txtString):
    """Write txtString at the midpoint of the parent-child edge."""
    # midpoint of the segment, computed as child + half the offset to the parent
    midX = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    midY = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    axis = createPlot.ax1
    axis.text(midX, midY, txtString, va="center", ha="center", rotation=30)


def plotTree(myTree, parentPt, nodeTxt):
    """Recursively draw subtree `myTree` hanging from `parentPt`.

    Shared layout state lives in function attributes set by createPlot:
    plotTree.totalW / totalD (total leaves / depth, fixing the grid),
    plotTree.xOff (advances one slot per drawn leaf) and plotTree.yOff
    (current depth level).

    :param myTree: nested dict {feature: {value: subtree-or-label}}
    :param parentPt: (x, y) of the parent node in axes-fraction coords
    :param nodeTxt: label for the edge coming from the parent
    """
    numLeafs = getNumLeafs(myTree)  # width of this subtree, in leaves
    depth = getTreeDepth(myTree)  # NOTE(review): computed but never used
    firstStr = list(myTree.keys())[0]  # feature name at this subtree's root
    # center the root horizontally over the slots its leaves will occupy
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)  # label the incoming edge
    plotNode(firstStr, cntrPt, parentPt, decisionNode)  # draw the decision node
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD  # descend one level
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            # internal node: recurse with this node as the new parent
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            # leaf: advance one horizontal slot and draw the label node
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD  # restore level on unwind


def createPlot(inTree):
    """Render the whole decision tree `inTree` in a fresh figure."""
    figure = plt.figure(1, facecolor='white')
    figure.clf()
    # a frameless, tick-free axes shared with the plot helpers
    createPlot.ax1 = plt.subplot(111, frameon=False, xticks=[], yticks=[])
    # seed the layout state that plotTree maintains as function attributes
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()

main.py

# -*- coding: utf-8 -*-

from pylab import *
import treePlotter
from ID3Tree import *
# mpl.rcParams['font.sans-serif'] = ['SimHei']  # 指定默认字体
# mpl.rcParams['axes.unicode_minus'] = False  # 解决保存图像时负号'-'显示为方块的问题

# 测试决策树的构建
# Build the tree from the Excel dataset, then render it.
# Guarded so importing this module does not trigger file I/O and plotting.
if __name__ == "__main__":
    myDat, labels = createDataSet()
    myTree = createTree(myDat, labels)
    # draw the decision tree
    treePlotter.createPlot(myTree)

运行结果:

play.xlsx:

Car Evaluation.xlsx:

根据Car Evaluation数据集,使用贝叶斯分类对数据集进行分析。

Naive_bayes.py

import numpy as np


class Naive_bayes:
    """Naive Bayes classifier for categorical (integer-encoded) features.

    fit() estimates the log prior Py and the log class-conditional
    probabilities Px_y with Laplace (add-one) smoothing; the predict
    methods combine them additively because everything is kept in log
    space (products of probabilities become sums of logs, avoiding
    underflow).

    Assumes class labels are integers 0 .. n_classes-1 (they index Py).
    """

    num = 0          # unused; kept for backward compatibility
    feature_cat = 0  # number of feature columns, set by fit()
    label_cat = 0    # number of distinct class labels, set by fit()

    def __init__(self):
        pass

    def NaiveBayes(self, Py, Px_y, x):
        """Predict the class of one sample.

        :param Py: log prior per class, shape (classNum, 1), from fit()
        :param Px_y: nested dict Px_y[class][feature][value] -> log prob, from fit()
        :param x: feature vector of the sample
        :return: index of the class maximizing log P(y) + sum_j log P(x_j | y)
        """
        featureNum = self.feature_cat
        classNum = self.label_cat
        P = [0] * classNum  # per-class log-space score
        for i in range(classNum):
            logSum = 0  # log space: the probability product becomes a sum
            for j in range(featureNum):
                # NOTE(review): a (class, feature, value) combo never seen in
                # training contributes 0 (i.e. probability 1); a stricter
                # implementation would add the smoothed floor log(1/(count+l)).
                # Kept as-is for compatibility with the original behavior.
                if x[j] in Px_y[i][j]:
                    logSum += Px_y[i][j][x[j]]
            P[i] = logSum + Py[i]
        return P.index(max(P))

    def cost_NaiveBayes(self, Py, Px_y, x, cost):
        """Predict the class of x under a misclassification cost matrix.

        Scores each class as in NaiveBayes, then picks the class m that
        minimizes the expected cost sum_n P[n] * cost[m][n].

        :param cost: square matrix, cost[m][n] = cost of deciding m when truth is n
        :return: index of the minimum-risk class
        """
        featureNum = self.feature_cat
        classNum = self.label_cat
        P = [0] * classNum   # per-class log-space score
        risk = [0] * classNum  # expected cost of deciding each class
        for i in range(classNum):
            logSum = 0
            for j in range(featureNum):
                if x[j] in Px_y[i][j]:
                    logSum += Px_y[i][j][x[j]]
            P[i] = logSum + Py[i]
        for m in range(classNum):
            total = 0
            for n in range(classNum):
                total += P[n] * cost[m][n]
            risk[m] = total
        return risk.index(min(risk))

    def Naive_test(self, Py, Px_y, test_data, test_label):
        """Return the fraction of test samples whose TRUE label is 0, 1, 2, 3.

        NOTE(review): despite the name this does not measure accuracy — it
        only tallies the true-label distribution of the test set. (The
        original computed a prediction per sample and discarded it; that
        dead call has been removed without changing the returned values.)

        :return: tuple (p0, p1, p2, p3) of label fractions
        """
        counts = [0, 0, 0, 0]
        for i in range(len(test_data)):
            label = test_label[i]
            if 0 <= label <= 3:  # labels outside 0..3 are ignored, as before
                counts[label] += 1
        total = len(test_data)
        return counts[0] / total, counts[1] / total, counts[2] / total, counts[3] / total

    def cost_Naive_test(self, Py, Px_y, test_data, test_label, cost):
        """Return the accuracy of cost-sensitive prediction on the test set.

        :return: 1 - (misclassified / total)
        """
        errorCnt = 0
        for i in range(len(test_data)):
            predicted = self.cost_NaiveBayes(Py, Px_y, test_data[i], cost)
            if predicted != test_label[i]:
                errorCnt += 1
        return 1 - (errorCnt / len(test_data))

    def fit(self, train_data, train_label):
        """Estimate the (log) prior and class-conditional distributions.

        :param train_data: 2-D array, one row per sample, categorical values
        :param train_label: 1-D array of integer class labels 0..classNum-1
        :return: (Py, Px_y) — log prior array of shape (classNum, 1) and
            nested dict Px_y[class][feature][value] -> log probability
        """
        featureNum = train_data.shape[1]
        self.feature_cat = featureNum
        classes = set(train_label)
        classNum = len(classes)
        self.label_cat = classNum

        # Prior with add-one smoothing: a class absent from the training
        # set still gets a non-zero probability.
        Py = np.zeros((classNum, 1))
        label_dic = {}  # label -> smoothed class count (count + 1)
        for c in classes:
            label_dic[c] = np.sum(train_label == c) + 1
            Py[int(c)] = label_dic[c] / (len(train_label) + classNum)
        Py = np.log(Py)  # log space to prevent underflow later

        # Raw co-occurrence counts: Px_y[class][feature][value] -> count
        Px_y = {i: {j: {} for j in range(featureNum)} for i in range(classNum)}
        for m in range(len(train_label)):
            c = train_label[m]
            x = train_data[m]
            for n in range(featureNum):
                Px_y[c][n][x[n]] = Px_y[c][n].get(x[n], 0) + 1

        # Convert counts to Laplace-smoothed log conditional probabilities.
        for c in range(classNum):
            for z in range(featureNum):
                l = len(Px_y[c][z])  # number of distinct values seen
                for key, cnt in Px_y[c][z].items():
                    # BUGFIX: the smoothing term l belongs in the denominator;
                    # the original computed (cnt+1)/label_dic[c] + l, adding l
                    # OUTSIDE the division, which is not a probability at all.
                    Px_y[c][z][key] = np.log((cnt + 1) / (label_dic[c] + l))
        return Py, Px_y

main.py

import Naive_bayes

from sklearn.model_selection import RepeatedKFold
from sklearn import preprocessing
import pandas as pd
import numpy as np

if __name__ == "__main__":
    df = pd.read_excel(r"C:\Users\zlh\Desktop\数据分析方法实验\实验三、四\test.xlsx")
    raw_set = df.values
    label_encoder = []
    #  放置每一列的encoder
    encoded_set = np.empty(raw_set.shape)
    for i, _ in enumerate(raw_set[0]):
        # 拟合每一列上的数据
        encoder = preprocessing.LabelEncoder()
        encoded_set[:, i] = encoder.fit_transform(raw_set[:, i])
        label_encoder.append(encoder)
    dataset_X = encoded_set[:, :-1].astype(int)
    dataset_y = encoded_set[:, -1].astype(int)
    #  将数据集拆分为train set 和test set      start = time.time()
    naive_bys = Naive_bayes.Naive_bayes()
    # 使用习得的先验概率分布和条件概率分布对测试集进行测试
    kf = RepeatedKFold(n_splits=10)
    n0 = 0
    n1 = 0
    n2 = 0
    n3 = 0
    Accuracy0 = 0
    Accuracy1 = 0
    Accuracy2 = 0
    Accuracy3 = 0
    for train_index, test_index in kf.split(dataset_X):
        train_X, train_y = dataset_X[train_index], dataset_y[train_index]
        test_X, test_y = dataset_X[test_index], dataset_y[test_index]
        Py, Px_y = naive_bys.fit(train_X, train_y)
        n0, n1, n2, n3 = naive_bys.Naive_test(Py, Px_y, test_X, test_y)
        Accuracy0 += n0
        Accuracy1 += n1
        Accuracy2 += n2
        Accuracy3 += n3
        # print(naive_bys.Naive_test(Py, Px_y, test_X, test_y))
    print("class  \t  N\t\t\tN[%]")
    print('acc:   \t', np.sum(dataset_y == 0), '  \t%f' % Accuracy0, "%")
    print('good:  \t', np.sum(dataset_y == 1), '   \t%f' % Accuracy1, "%")
    print('unacc: \t', np.sum(dataset_y == 2), ' \t%f' % Accuracy2, "%")
    print('v-good:\t', np.sum(dataset_y == 3), '   \t%f' % Accuracy3, "%")
    print("这里Word里面有个错误,总数据行数为1576,而1210+384+69+65=",1210+384+69+65)
    print("所以N[%]也不一定是对的,而我的N[%]相加为",Accuracy0 + Accuracy1 + Accuracy2 + Accuracy3)
    print("近似100%(由于int转flot或flot转int时取舍导致的误差)")

运行结果:(Car Evaluation.xlsx 中doors 和persons 中的5more,more会导致int和str报错,所以我把5more改成6,more改成8)

五、总结:

通过实验,体会分类过程,深刻理解分类思想。熟悉和掌握决策树的分类原理、实质和过程,掌握典型的决策树算法和实现技术。熟悉和掌握贝叶斯分类原理、实质和过程,掌握贝叶斯分类算法和实现技术。

实验成绩: 指导教师: 年 月 日

附:

Car Evaluation.xlsx

play.xlsx

test.xlsx

最后修改:2021 年 05 月 11 日
如果觉得我的文章对你有用,请随意赞赏