`
tempsitegoogle
  • 浏览: 866545 次
文章分类
社区版块
存档分类
最新评论

R语言学习系列(数据挖掘之决策树算法实现--ID3代码篇)

 
阅读更多

1、辅助类,用于计算过程和结果存储

/// <summary>
    /// 决策树节点.
    /// </summary>
    public class DecisionTreeNode
    {
        /// <summary>
        /// 类型:分支或叶子
        /// </summary>
        public string Type { get; set; }
        /// <summary>
        /// 关键字一般存当前属性因子
        /// </summary>
        public string Key { get; set; }
        /// <summary>
        /// 判断值,叶子节点有效.
        /// </summary>
        public string DecisionValue { get; set; }
        /// <summary>
        /// 前一个属性因子,可以看作是分支条件.
        /// </summary>
        public string ParentFactor { get; set; }
        /// <summary>
        /// 当前节点的样本数量,
        /// </summary>
        public int CalcCount { get; set; }
        /// <summary>
        /// 当前节点的样本索引集合.
        /// </summary>
        public List<int> DataIndexes {get;set;}
        /// <summary>
        /// 分支节点集合.
        /// </summary>
        public Dictionary<string, DecisionTreeNode> Children { get; private set; }
        /// <summary>
        /// 父节点
        /// </summary>
        public DecisionTreeNode Parent { get; set; }
        public DecisionTreeNode()
        {
            DataIndexes = new List<int>();
            Children = new Dictionary<string, DecisionTreeNode>();
        }
       
    }
    /// <summary>
    /// 用于计算过程存放数据.用数组不是很方便,这里采用字典,可以减少循环次数.
    /// </summary>
    public class CalcNode
    {
        public string Key { get; set; }
        public string Type { get; set; }
        public int CalcCount { get; set; }
        public List<int> DataIndexes {get;set;}
        public Dictionary<string, CalcNode> Children { get; private set; }
        public CalcNode()
        {
            DataIndexes = new List<int>();
            Children = new Dictionary<string, CalcNode>();
        }
        public void AddChildren(string Key,string AType,int AIndex, int Count = 1)
        {
            if (Children.ContainsKey(Key) == false)
            {
                Children.Add(Key, new CalcNode());
            }
            Children[Key].Key = Key;
            Children[Key].Type = AType;
            Children[Key].CalcCount += Count;
            Children[Key].DataIndexes.Add(AIndex);
        }
      
    }


2、算法类,注释比较详细,有时间再写一篇原理文章

 /// <summary>
    /// 决策树算法类,不适合连续性值。
    /// </summary>
    public class DecisionTreeAlg
    {
        private string PrefixString = "                                                                                                                                                                                                       ";
        /// <summary>
        /// 构建决策树,决策分类属性约定放在第1列。
        /// </summary>
        /// <param name="Inputs">行表示属性,列为值,注意列等长</param>
        /// <param name="PNode">父节点</param>
        /// <param name="PropertyNames">测试属性名称</param>
        /// <param name="TestProperties">当前可用测试属性索引</param>
        /// <param name="DefaultClassFactor">缺省判别决策分类因子</param>
        /// <param name="CallLevel">用来测试输出控制,无实际作用</param>
        /// <param name="OutContents">输出内容,为调试用</param>
        /// <param name="PropertyFactors">属性因子</param>
        public void BuildDecisionTree(int CallLevel, ref string OutContents, string[][] Inputs, DecisionTreeNode PNode, string[] PropertyNames, List<int> TestProperties, string DefaultClassFactor, Dictionary<string, List<string>> PropertyFactors)
        {
            
            string thePrefix = PrefixString.Substring(0, CallLevel * 2);
            CallLevel++;
            //如果没有测试属性,将当前节点设为叶子节点,选择高概率分类,然后返回
            if (TestProperties.Count <= 1)
            {
                PNode.Type = "叶子";
                PNode.DecisionValue = DefaultClassFactor;
                return;
            }
            //如果没有学习样本集,将当前节点设为叶子节点,选择高概率分类,然后返回
            if (PNode.DataIndexes.Count <= 0)
            {
                PNode.Type = "叶子";
                PNode.DecisionValue = DefaultClassFactor;
                return;
            }

            if (PropertyFactors == null)
            {
                PropertyFactors = new Dictionary<string, List<string>>();
            }
            //准备存储遍历时的计数存储结构
            Dictionary<string, CalcNode> thePropertyCount = new Dictionary<string, CalcNode>();
            foreach (var theProIndex in TestProperties)
            {
                thePropertyCount.Add(PropertyNames[theProIndex], new CalcNode() { Key = PropertyNames[theProIndex] });
                if (PropertyFactors.ContainsKey(PropertyNames[theProIndex]) == false)
                {
                    PropertyFactors.Add(PropertyNames[theProIndex], new List<string>());
                }
            }
            //遍历当前可遍历的数据,进行统计,为计算各属性熵做准备
            for (int n = 0; n < PNode.DataIndexes.Count; n++)
            {
                int theI = PNode.DataIndexes[n];
                for (int k = 0; k < TestProperties.Count; k++)
                {
                    int theJ = TestProperties[k];
                    var thePropertyCalcNode = thePropertyCount[PropertyNames[theJ]];
                    //对当前属性计数
                    thePropertyCalcNode.CalcCount++;
                    //对第j个属性的当前因子计数
                    thePropertyCalcNode.AddChildren(Inputs[theJ][theI], "测试属性因子", theI, 1);
                    //对第j个属性的当前因子的主分类因子计数
                    thePropertyCalcNode.Children[Inputs[theJ][theI]].AddChildren(Inputs[0][theI], "主分类因子", theI, 1);
                    //统计归纳各属性因子,采用这种方式可以减少循环.
                    if (PropertyFactors[PropertyNames[theJ]].Contains(Inputs[theJ][theI]) == false)
                    {
                        PropertyFactors[PropertyNames[theJ]].Add(Inputs[theJ][theI]);
                    }
                }
            }
            
            //计算信息增益量,获取具有最大信息增益属性
            string theDefaultClassFactor = DefaultClassFactor;
            //初始化最大测试属性熵值.
            double theMaxEA = double.MinValue;
            //记录具有最大熵值属性的索引位置
            int theMaxPropertyIndex = TestProperties[1];
            //总信息熵值,其实就是分类属性的熵值.
            double theTotalEA = 0.0;
            //记录总的样本数,用于估算概率.
            double theTotalSimple = 0;

            for(int theI=0;theI<TestProperties.Count;theI++)
            {
                int thePIndex_1 = TestProperties[theI];
                if (thePIndex_1 == 0)
                {
                    //主分类熵值计算,计算公式与测试属性有所不同.
                    CalcNode theCalcNode = thePropertyCount[PropertyNames[thePIndex_1]];
                    double theCount = theCalcNode.CalcCount;
                    theTotalSimple = theCount;
                    double theMaxSubCount = -1;
                    theTotalEA = 0.0;
                    //求和(-Pj*log2(Pj))
                    foreach (var theSubNode in theCalcNode.Children)
                    {
                        if (theSubNode.Value.CalcCount > 0)
                        {
                            double thePj = theSubNode.Value.CalcCount / theCount;
                            theTotalEA += 0 - thePj * Math.Log(thePj, 2);
                        }
                        if (theMaxSubCount < theSubNode.Value.CalcCount)
                        {
                            theMaxSubCount = theSubNode.Value.CalcCount;
                            theDefaultClassFactor = theSubNode.Key;
                        }
                        //测试输出,跟踪计算路径.
                        OutContents += "\r\n" + thePrefix + theCalcNode.CalcCount + ":: " + PropertyNames[thePIndex_1] + ":: " + theSubNode.Value.Type + " :: " + theSubNode.Key + " :: " + theSubNode.Value.CalcCount; 

                    }
                }
                else
                {
                    //测试属性熵值计算。
                    CalcNode theCalcNode = thePropertyCount[PropertyNames[thePIndex_1]];
                    double theJEA = 0.0;
                    foreach (var theSubNode_1 in theCalcNode.Children)
                    {
                        if (theSubNode_1.Value.CalcCount > 0)
                        {
                            double theSjCount = theSubNode_1.Value.CalcCount;
                            double theSj_1 = theSjCount / theTotalSimple;
                            double theSj_2 = 0.0;
                            
                            foreach (var theSubNode_2 in theSubNode_1.Value.Children)
                            {
                                if (theSubNode_2.Value.CalcCount > 0)
                                {
                                    double thePj_1 = Convert.ToDouble(theSubNode_2.Value.CalcCount) / theSjCount;
                                    theSj_2 += 0.0 - thePj_1 * Math.Log(thePj_1, 2);
                                }
                                OutContents += "\r\n" + thePrefix + theCalcNode.CalcCount + ":: " + PropertyNames[thePIndex_1] + " :: " + theSubNode_1.Value.Type + " :: " + theSubNode_1.Key + " :: " + theSubNode_1.Value.CalcCount
                                     + theSubNode_2.Value.Type + " :: "  + theSubNode_2.Key + " :: " + theSubNode_2.Value.CalcCount; 
                            }
                            theJEA += theSj_1 * theSj_2;
                        }
                        
                    }
                    theJEA = theTotalEA - theJEA;
                    //只记录最大熵值属性信息.
                    if (theMaxEA < theJEA)
                    {
                        theMaxEA = theJEA;
                        theMaxPropertyIndex = thePIndex_1;
                    }
                }
            }
            //如果分类因子只有一个,则置当前节点为叶子节点,设置判定为当前分类因子,然后返回
            if (thePropertyCount[PropertyNames[0]].Children.Count <= 1)
            {
                PNode.Type = "叶子";
                PNode.DecisionValue = theDefaultClassFactor;
                return;
            }
            //具有多个分类因子,还剩有测试属性,则设当前节点为分支节点,准备分支.
            PNode.Type = "分支";
            //1选取最大增益信息量测试属性,做分支处理,做处理,注意属性一旦处理,将不在后续节点中再处理
            //因此需要在测试属性集合中删除所选测试属性.注意保持分类属性在开始索引处(0).
            PNode.Key = PropertyNames[theMaxPropertyIndex];

             CalcNode theCalcNode_2 = thePropertyCount[PropertyNames[theMaxPropertyIndex]];
             List<string> theFactors = PropertyFactors[PropertyNames[theMaxPropertyIndex]];
             List<int> theAvailableTestPs = new List<int>();
             for (int i = 0; i < TestProperties.Count; i++)
             {
                 if (theMaxPropertyIndex != TestProperties[i])
                 {
                     theAvailableTestPs.Add(TestProperties[i]);
                 }
             }
             //对所选测试属性的所有因子进行处理.
             foreach (var theFactor_1 in theFactors)
             {
                 //如果当前因子不在计算中,则添加一个叶子节点,判定为高概率分类。
                 if (theCalcNode_2.Children.ContainsKey(theFactor_1) == false)
                 {
                     DecisionTreeNode theNode_1 = new DecisionTreeNode();
                     theNode_1.ParentFactor = theFactor_1;
                     theNode_1.CalcCount = 0;
                     theNode_1.DecisionValue = theDefaultClassFactor;
                     theNode_1.Parent = PNode;
                     theNode_1.Key = theFactor_1;
                     theNode_1.Type = "叶子";
                     PNode.Children.Add(theFactor_1, theNode_1);
                     continue;
                 }
                 //如果当前因子存在,但不存在样本,则添加一个叶子节点,判定为高概率分类。
                 if (theCalcNode_2.Children[theFactor_1].CalcCount<=0)
                 {
                     DecisionTreeNode theNode_1 = new DecisionTreeNode();
                     theNode_1.ParentFactor = theFactor_1;
                     theNode_1.CalcCount = 0;
                     theNode_1.DecisionValue = theDefaultClassFactor;
                     theNode_1.Parent = PNode;
                     theNode_1.Type = "叶子";
                     theNode_1.Key = theFactor_1;
                     PNode.Children.Add(theFactor_1, theNode_1);
                     continue;
                 }
                 //如果存在,且有学习样本,则添加一个节点,并以此节点递归处理.
                 DecisionTreeNode theNode_2 = new DecisionTreeNode();
                 theNode_2.ParentFactor = theFactor_1;
                 theNode_2.Parent = PNode;
                 theNode_2.Key = theFactor_1;
                 theNode_2.CalcCount = theCalcNode_2.Children[theFactor_1].CalcCount;
                 theNode_2.DataIndexes.AddRange(theCalcNode_2.Children[theFactor_1].DataIndexes);
                 PNode.Children.Add(theFactor_1, theNode_2);
                 BuildDecisionTree(CallLevel, ref OutContents, Inputs, theNode_2, PropertyNames, theAvailableTestPs, theDefaultClassFactor, PropertyFactors);
             }
        }

    }


3、测试代码:

private void button1_Click(object sender, EventArgs e)
        {
            DecisionTreeAlg theAlg = new DecisionTreeAlg();
            string[][] theInputs = new string[4][];
            theInputs[0] = new string[] { "no", "yes", "yes", "yes", "yes", "yes", "no", "yes", "yes", "no" };
            theInputs[1] = new string[] { "s", "s", "l", "m", "l", "m", "m", "l", "m", "s" };
            theInputs[2] = new string[] { "s", "l", "m", "m", "m", "l", "s", "m", "s", "s" };
            theInputs[3] = new string[] { "no", "yes", "yes", "yes", "no", "no", "no", "no", "no", "yes" };

            string[] thePropertyName = new string[] {"是否真实帐号","日志密度","好友密度","是否真实头像" };

            DecisionTreeNode theRootNode = new DecisionTreeNode();
            theRootNode.DataIndexes.AddRange(new List<int>() { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 });

            List<int> theTestPs = new List<int>() { 0, 1, 2, 3 };
            string theOuts = "";
            theAlg.BuildDecisionTree(0,ref theOuts, theInputs, theRootNode, thePropertyName, theTestPs, "", null);
            this.treeView1.Nodes.Clear();
            TreeNode theRoot = new TreeNode();
            this.treeView1.Nodes.Add(theRoot);
            VisitTree(theRoot, theRootNode);
            this.textBox1.Text = theOuts;
        }
        private void VisitTree(TreeNode PNode, DecisionTreeNode PDNode)
        {
            PNode.Text = PDNode.Key + "(" + PDNode.Type + ")[判定:"+PDNode.DecisionValue +"]";
            foreach (var theNode in PDNode.Children.Values)
            {
                TreeNode theTmpNode = new TreeNode();
                PNode.Nodes.Add(theTmpNode);
                VisitTree(theTmpNode, theNode);
            }
        }


分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics