《数据挖掘决策树算法Java实现.docx》由会员分享,可在线阅读,更多相关《数据挖掘决策树算法Java实现.docx(10页珍藏版)》请在taowenge.com淘文阁网|工程机械CAD图纸|机械工程制图|CAD装配图下载|SolidWorks_CaTia_CAD_UG_PROE_设计图分享下载上搜索。
1、import java.util.HashMap;import java.util.HashSet;import java.util.LinkedHashSet;importjava.util.Iterator;调试过程中发现4个错误,感谢宇宙无敌的调试工具一一print1、selectAtrribute中的一个数组下标出错2、两个字符串相等的判断3、输入的数据有一个错误4、selectAtrribute中最后一个循环忘记了i+决策树的树结点类class TreeNode String element; 该值为数据的属性名称 String value;/上一个分裂属性在此结点的值LinkedH
2、ashSetchiIds; 结点的子结点,以有顺序的链式哈希集存储public TreeNode() this.element = null; this.value = null; this.childs 二 null;public TreeNode(String value) this.element = null;this. value = value;this.childs = null;publicStringreturngetElement()this.element;public void setElement(String e)this.element = e;public St
3、ring getValue()returnthis.value;public void setValue( String v) this.value = v;public LinkedHashSetreturn this.childs;getChilds()构造决策树TreeNode root = new TreeNode();DecisionTree decisionTree = new DecisionTree(root);decisionTree.buildDecisionTree(root, deData, flags, attributes, attrlndexMap);public
4、 void setChilds(LinkedHashSet childs)this.childs = childs;决策树类class DecisionTree TreeNode root; 决策树的树根结点public DecisionTree()root = new TreeNode();public DecisionTree(TreeNode root) this.root = root;public TreeNode getRoot() return root;public void setRoot(T reeN ode root) this.root = root;public St
5、ring selectAtiribute(TreeNode node,String口口 deData, boolean flags,LinkedHashSet atrributes, HashMap attrIndexMap)Gain数组存放当前结点未分类属性的Gain值double Gain= new doubleatrributes.size()每条数据中归类的下标,为每条数据的最后一个值 int class_index = deData0.length - 1;属性名,该结点在该属性上进行分类 String retum_atmbute 二 null;计算每一个未分类属性的Gain值 in
6、t count = 0;计算到第几个属性 for(String atrribute:atrributes) 该属性有多少个值,该属性有多少个分类 int values_count, class_count;属性值对应的下标int index = attrlndexMap.get(atrribute);存放属性的各个值和分类值LinkedHashSet();LinkedHashSet();LinkedHashSet values = newLinkedHashSet classes = newfor(int i =O;ideData.length; i+) if(flagsi=true)valu
7、es.add(deDatai index); classes.add(deDataiclass_index);values_count =values.size();class_count 二 classes.size();class_count;int values_vector= new intvalues_count int class_vector= new intclass_count;for(int i=0;ideData. length;i+)if(flagsi=true)int j =0; for(String v: values)( if(deDatafi index.equ
8、als(v) break;else j+;int k =0for(String c:classes)if(deDatai class_index .equals(c) break; else k+;values_vectorj*class_count+k+; class_vectork+;/* 输出各项统计值for(int i =0; ivalues_count * class_count; i+)System.out.print(values_vectori+,n);/Math.log(2.0)*System.out.println();for(int i =0;iclass_count;
9、i+)System.out.print(class_vectori+n);System.out.println();*/计算 InforDdouble InfoD = ().0;double class_total =0.0;for(int i =();iclass_vector.length; i+) class_total += class_vectori;for(int i = ();iclass_vector.length; i+) if(class_vectori=0)continue; else doubled=Math.log(class_vectori/class_total)
10、class_vectori/class_total;InfoD = InfoD - d;/计算 InfoAdouble InfoA = 0.0;for(int i =(); ivalues_count; i+)double middle = 0.0;doubleattr_count=0.();for(intattr_countj=0;jclass_count;j+4-)+=values_vectori*class_count+j;for(intj=0;jclass_count;j+)if(values_vectori*class_count+j!=0)double k =values_vect
11、orli*class_count+j I;middle = middle -Math. 1 og(k/att r_co un t)/Math.log(2.0)*k/attr_count;InfoA += middleattr_count /class_ total;Gaincount= count+;InfoD- Info A;double max = 0.0;int i =0;for(String atrribute:atrributes) max)max = Gaini;retum_atrribute = atrribute;return return attribute;/node:在当
12、前结点构造决策树/deData,/flags指示在当前结点构造决策树时哪些数据是需要的/attributes:未分类的属性集/attrlndexMap:属性与对应数据下标public void buildDecisionTree(TreeNode node, String deData, boolean flags,LinkedHashSet attributes, HashMap attrIndexMap)如果待分类属性已空if(attributes.isEmpty()- true) 从数据集中选择多数类,遍历符合条件的所有数据HashMap classMap = newHashMap();
13、int classindex = deData0. length - 1;for(int i = 0; ideData.length; i+)if(flagsi= true) if(classMap.containsKey(deDatai classindex) int count = classMap.get(deDatai classindex);classMap.put(deDatai classindex,count+1);JelseclassMap.put(deDatai classindex, 1);选择多数类String mostClass 二 null;int mostCoun
14、t = 0;Iterator it = classMap.keySet().iterator();while(it.hasNext() String strClass =(String)it.next();if( classMap. get( strClass) mostCount) mostClass = strClass;mostCount = classMap. get( strClass)对结点进行赋值,该结点为叶结点 node. setElement(mostClass); node. setChilds(null);System.out.println(yezhi:n+node.g
15、etElement()+,:,+node.getValue(); return; 如果待分类数据全都属于一个类 int class_index 二 deData0J.length - 1;String class_name = null;HashSet classSet = new HashSet()for(int i =0; ideData.length; i+) if(flagsi= true) class_name =deDatai class_index;classSet.add(class_name);则该结点为叶结点,设置有关值,然后返回if(classSet.size()=1)n
16、ode. setElement(class_name);node. setChilds(null);System.out.println(leaf:n+node.getElement()+u:n+node.getValueQ);return;给定的分枝没有元组,是不是有这种情况?/选择一个分类属性String attribute = selectAtrribute(node, deData, flags, attributes,attrlndexMap);设置分裂结点的值node.setElement(attribute);/System.out.println(attribute); if(
17、node = root) System.out.println(nroot:H+node.getElement()+H: n+node.getValue(); else System.out.printlnCbranch:+node.getElement()+n:H+node.getValue(); 生成和设置各个子结点 int attrlndex = attrlndexMap.get(attribute); LinkedHashSet attrValues = new LinkedHashSet(); for(int i = O;ideData.length; i+) if(flagsi=
18、true) attrValues.add(deDatai attrlndex);LinkedHashSet childs = new LinkedHashSet() for(StringattrV alue:attrV alues)TreeNode tn = new TreeNode(attrValue); childs.add(tn);node.setChilds(childs);在候选分类属性中删除当前属性attributes.remove(attribute);在各个子结点上递归调用本函数 if(childs.isEmpty()! =true) for(TreeNode child:ch
19、ilds) 设置子结点待分类的数据集 boolean newFlags= new booleandeData.length for(int i =0; ideData.length; i+) newFlagsi= flagsi; if(deDatai attrlndex! =child.getValue()newFlagsi= false;设置子结点待分类的属性集LinkedHashSet newAttributes 二 new LinkedHashSet();for(String attr:attributes)(newAttributes. add(attr);在子结点上递归生成决策树bu
20、ildDecisionTree(child, deData, newFlags, new Attributes, attrlndexMap);输出决策树public void printDecisionTree() public class DeTree public static void main(String args)/*输入数据集1String deData口口 = new String12;deData0= newString口 “Yes”JNo,“No”JYes,“Some”JhighJNoJYes”JFrench”J(M(rjYes”deDatal= newString口 “
21、Yes,“No JNo“ J Yes,“Fun,“low,“No,“No JThaV J30 60“ JNo”;deData2= newStringNoYesJNo”JNo”JSome”Jk)w”JNo”JNo”JBurger”J0lOJYes;deData3= newString口 ” YesTNo” J Yes”,“ YesFullow”,“ Yes,“No”Thai” J10 30“ J Yes”;deData4= newString口 “Yes”JNo“JYes”,“NoTFull*”high”JNo“JYes”JFrench”J6(TJNo”; deData5= newString
22、口 NoTYesJNo”JYes”JSome”Jmiddle”JYes“JYes”JItaliarr,“010“JYes”;deData6= newString 口 “No” JYes“JNo”JNo”, “None Jlow”JYes“JNo”, “Burger”, “010JNo;deData7= newString 口 ”NoNoJNo,“Yes”JSome”JmiddletYes”,“Yes”JThai”JO-l(TJYes”deData8= newString,No,Tes,nYes,VNo,V,FuirV,low,V,Yes,V,No,V,Burger,V,60,V,Non; de
23、Data9= newString口 “Yes“JYes”JYes”JYes“JFun”Jhigh,“No“JYes”JItalian”J103O,”NO);deData10= newSlring ”No“ JNo“ JNo” JNd,”None,“low“ JNo” JNoThai”0 10“ JNo”;deDatall= newStringnYes,Tes,VYes,V,Yes,VFuirV,lown;,No,V,No,V,Burger;n30-60,V,Yesn/待分类的属性集1newString naltV,bar,nfri,Mhunn;,patn,Stringattr=price” J
24、rain, res,type Jest;*/ /输入数据集2 String deData= new deDataO=newdeDatal=newdeData2=newdeData3J=newdeData 4 =newdeData5=newdeData6= newString14;String nyouthn;,high,Vnon,nfair,V,non;String youth Jhigh,no, excellent, “no;String ,middle_agedn,high,non;,fairVyesn;String nseniorVmedium,no,fairVyes,);String
25、nsenior,V,low,Vyes,V,fairVyesM;String口 senior Jk)wJyesJexceHenno;String middle_aged, low Jyes, excellent Jyes;deData7= deData8= deData9= deData10=newString youth”/medium” Jno” Jfair Jno;newString youth,V,low,V,yesHfair,(;,yesn;newString senior,medium,yes,nfairn,Hyesn;newString youth, medium, yes,exc
26、ellent, yes; deData 11 =newdeData二newdeData13=new/待分类的属性集2String attr= newcrediCrating1String ,middle_aged,1,high,V,yesn;,fair,V,yesn;String nsenior,;,medium,no,excellent,;non;String nageM,income0,student,StringMmiddle_aged,nrnediumVno,,excellent,Vyesn;LinkedHashSet();LinkedHashSet attributes = new for(int i =0; iattr.length; i+) attributes. add(attri);/属性与数据集中对应数据的下标HashMap attrlndexMap = newHashMap();/需要分类的数据, boolean flags= for(iiit i =0;flagsi=for(int i =0; iattr.length; i+) attrIndexMap.put(attri ,i);初始为整个数据集newbooleanfdeData.length;ideDala.length; i+)true;