Repository /Rseslib/rseslib-3.0.1.jar:rseslib.processing.discretization.EntropyMinDynamicDiscretizationProvider


Back

No file description

Source code

/*
 * $RCSfile: EntropyMinDynamicDiscretizationProvider.java,v $
 * $Revision: 1.1 $
 * $Date: 2009/05/14 10:10:13 $
 * $Author: wojna $
 * 
 * Copyright (C) 2002 - 2007 Logic Group, Institute of Mathematics, Warsaw University
 * 
 *  This file is part of Rseslib.
 *
 *  Rseslib is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  Rseslib is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */


package rseslib.processing.discretization;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.TreeMap;

import rseslib.processing.transformation.FunctionBasedAttributeTransformer;
import rseslib.processing.transformation.TransformationProvider;
import rseslib.processing.transformation.Transformer;
import rseslib.structure.attribute.ArrayHeader;
import rseslib.structure.attribute.Attribute;
import rseslib.structure.attribute.Header;
import rseslib.structure.attribute.NumericAttribute;
import rseslib.structure.data.DoubleData;
import rseslib.structure.function.doubleval.AttributeDoubleFunction;
import rseslib.structure.table.ArrayListDoubleDataTable;
import rseslib.structure.table.DoubleDataTable;

/**
 * This class represents a method of discretizing continuous attributes based on
 * a minimal enthropy heuristic, dynamic version (1993 Fayyad & Irani).
 * 
 * @author Marcin Jałmużna
 */
public class EntropyMinDynamicDiscretizationProvider implements
		TransformationProvider {

	/** Key - number of attribute, value - cuts */
	private HashMap<Integer, ArrayList<Double>> cuts;


	/**
	 * Method that generate cuts for given data table;
	 * 
	 * @param tabla data table
	 */
	private void discretize(DoubleDataTable table) {

		double dataSize = table.getDataObjects().size();
		if (dataSize > 1) {
			ArrayList<Double> decisions = new ArrayList<Double>();
			for (DoubleData dd : table.getDataObjects()) {
				Double d = dd.get(dd.attributes().decision());
				if (!decisions.contains(d)) {
					decisions.add(d);
				}

			}

			int minAttr = -1;
			Double minCut = 0.0;
			Double mainMinVal = Double.MAX_VALUE;

			for (int attr = 0; attr < table.attributes().noOfAttr(); attr++) {
				if (table.attributes().isNumeric(attr)) {

					TreeMap<Double, Record> decisionDistribution;
					decisionDistribution = new TreeMap<Double, Record>();
					Iterator<DoubleData> iter = table.getDataObjects()
							.iterator();
					while (iter.hasNext()) {
						DoubleData dd = iter.next();
						Double tmp = Double.valueOf(dd.get(attr));
						Double dec = Double.valueOf(dd.get(table.attributes()
								.decision()));
						if (decisionDistribution.containsKey(tmp)) {
							Record r = decisionDistribution.get(tmp);
							r.setSum(r.getSum() + 1);
							if (r.getDecisions().containsKey(dec)) {
								r.getDecisions().put(dec,
										r.getDecisions().get(dec) + 1);
							} else {
								r.getDecisions().put(dec, 1);
							}
						} else {
							Record r = new Record();
							r.setSum(1);
							r.setDecisions(new HashMap<Double, Integer>());
							r.getDecisions().put(dec, 1);
							decisionDistribution.put(tmp, r);
						}
					}

					double min = decisionDistribution.firstKey();
					double minVal = Double.MAX_VALUE;
					double min_e_lw = 0;
					double min_e_gt = 0;
					int[][] min_tab = null;

					// zerujemy tablice
					int[][] tab = new int[decisions.size()][2];
					for (int j = 0; j < decisions.size(); j++) {
						tab[j][0] = 0;
						tab[j][1] = 0;
					}
					double lw = 0;
					double gt = dataSize;
					for (Double tmp : decisionDistribution.keySet()) {
						for (Double d : decisionDistribution.get(tmp)
								.getDecisions().keySet()) {
							tab[decisions.indexOf(d)][1] += decisionDistribution
									.get(tmp).getDecisions().get(d);
						}
					}

					// przegladamy wszystkie rekordy i wybieramy ten z
					// najmniejsza
					// wartoscia
					// entropii
					for (Double tmp : decisionDistribution.keySet()) {
						lw += decisionDistribution.get(tmp).getSum();
						gt -= decisionDistribution.get(tmp).getSum();

						for (Double d : decisionDistribution.get(tmp)
								.getDecisions().keySet()) {
							tab[decisions.indexOf(d)][0] += decisionDistribution
									.get(tmp).getDecisions().get(d);
							tab[decisions.indexOf(d)][1] -= decisionDistribution
									.get(tmp).getDecisions().get(d);
						}

						// liczymy entropi� dla danego ciecia
						double e_lw = 0;
						double e_gt = 0;
						for (int j = 0; j < decisions.size(); j++) {
							if (lw > 0) {
								double v1 = ((double) tab[j][0]) / (double) lw;
								if (v1 != 0)
									e_lw -= v1 * log2(v1);

							}
							if (gt > 0) {
								double v2 = (((double) tab[j][1]) / (double) gt);
								if (v2 != 0)
									e_gt -= v2 * log2(v2);

							}
						}

						double val = (((double) lw / dataSize) * e_lw)
								+ (((double) gt / dataSize) * e_gt);

						// uaktualniamy minimum
						if ((val < minVal) && (val < mainMinVal)) {
							minVal = val; // minimalna entropia
							min = tmp; // minimum val
							min_e_lw = e_lw;
							min_e_gt = e_gt;
							min_tab = tab.clone();
						}
					}
					if (min_tab != null) {
						double Dp0 = 0;
						double Dp1 = 0;
						double Dp = decisions.size();
						double ent = 0;
						for (int i = 0; i < Dp; i++) {
							double v = ((double) (min_tab[i][0] + min_tab[i][1]))
									/ dataSize;
							if (v > 0)
								ent -= v * log2(v);
							if (min_tab[i][0] > 0)
								Dp0++;
							if (min_tab[i][1] > 0)
								Dp1++;
						}

						double infoGain = ent - minVal;

						double delta = (log2(Math.pow(3.0, Dp) - 2.0))
								- ((Dp * ent) - (Dp0 * min_e_lw) - (Dp1 * min_e_gt));

						double weight = (log2(dataSize - 1.0) + delta)
								/ dataSize;

						if (infoGain >= weight) {
							minAttr = attr;
							mainMinVal = minVal;
							minCut = (min + decisionDistribution.higherKey(min)) / 2.0;
						}

					}

				}
			}

			if (minAttr >= 0) {
				ArrayList<DoubleData> l1 = new ArrayList<DoubleData>();
				ArrayList<DoubleData> l2 = new ArrayList<DoubleData>();

				for (int i = 0; i < table.getDataObjects().size(); i++) {
					DoubleData dd = table.getDataObjects().get(i);
					if (dd.get(minAttr) <= minCut) {
						l1.add(dd);
					} else {
						l2.add(dd);
					}
				}
				cuts.get(minAttr).add(minCut);

				discretize(new ArrayListDoubleDataTable(l1));
				discretize(new ArrayListDoubleDataTable(l2));
			}
		}
	}

	/**
	 * Method that compute logarithm (base 2) of given param.
	 */
	public static double log2(double d) {
		return Math.log(d) / Math.log(2.0);
	}

	private class Record {
		private int sum = 0;
		private HashMap<Double, Integer> decisions = new HashMap<Double, Integer>();

		public int getSum() {
			return sum;
		}

		public void setSum(int sum) {
			this.sum = sum;
		}

		public HashMap<Double, Integer> getDecisions() {
			return decisions;
		}

		public void setDecisions(HashMap<Double, Integer> decisions) {
			this.decisions = decisions;
		}

	}
	
    /**
     * Method that generate discretization based on data table.
     * 
     * @param table 		Data table to estimate the discretization.
     * @return attribute 	Discretization estimated on data table.
     */
	public Transformer generateTransformer(DoubleDataTable table) {
		Attribute[] attributes = new Attribute[table.attributes().noOfAttr()];
		AttributeDoubleFunction discr_table[] = new AttributeDoubleFunction[table
				.attributes().noOfAttr()];

		cuts = new HashMap<Integer, ArrayList<Double>>();
		for (int i = 0; i < table.attributes().noOfAttr(); i++) {
			if (table.attributes().isConditional(i)
					&& table.attributes().isNumeric(i)) {
				cuts.put(i, new ArrayList<Double>());
			}
		}
		discretize(table);

		for (int i = 0; i < attributes.length; i++)
			if (table.attributes().isConditional(i)
					&& table.attributes().isNumeric(i)) {
				Collections.sort(cuts.get(i));

				double[] c = new double[cuts.get(i).size()];
				for (int j = 0; j < c.length; j++) {
					c[j] = cuts.get(i).get(j);
					//System.out.println("attribute: " + i + " cut: " + c[j]);
				}
				NumericAttributeDiscretization disc = new NumericAttributeDiscretization(
						i, (NumericAttribute) table.attributes().attribute(i),
						c);
				attributes[i] = disc.getAttribute();
				discr_table[i] = disc;
			} else
				attributes[i] = table.attributes().attribute(i);

		Header new_header = new ArrayHeader(attributes, table.attributes()
				.missing());

		return new FunctionBasedAttributeTransformer(new_header, discr_table);
	}
}

Copyright © 2008-2011 by TunedIT
Design by luksite