Repository /Rseslib/rseslib-3.0.2.jar:rseslib.processing.discretization.EntropyMinStaticDiscretizationProvider


Back

No file description

Source code

/*
 * $RCSfile: EntropyMinStaticDiscretizationProvider.java,v $
 * $Revision: 1.1 $
 * $Date: 2009/05/14 10:10:13 $
 * $Author: wojna $
 * 
 * Copyright (C) 2002 - 2007 Logic Group, Institute of Mathematics, Warsaw University
 * 
 *  This file is part of Rseslib.
 *
 *  Rseslib is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  Rseslib is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */


package rseslib.processing.discretization;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.TreeMap;

import rseslib.structure.data.DoubleData;
import rseslib.structure.table.DoubleDataTable;

/**
 * This class represents a method of discretizing continuous attributes based on
 * a minimal enthropy heuristic, static version (1993 Fayyad & Irani).
 * 
 * @author Marcin Jałmużna
 */
public class EntropyMinStaticDiscretizationProvider extends
		AbstractDiscretizationProvider {

	// Dla ka�dej warto�ci dyskretyzowanego atrybutu obliczana jest 
	// ilo�� rekord�w z t� warto�ci� oraz rozk�ad na decyzje
	private TreeMap<Double, Record> decisionDistribution;

	public EntropyMinStaticDiscretizationProvider() {
		super(0);
	}
    /**
     * Creates discretization cuts for one attribute.
     * Main method of this discretization provider.
     * 
     * @param attribute				Selected attribute for discretization.
     * @param number_of_intervals 	Not important
     * @param table 				Data used for estimating the best cuts.
     * @return Discretization cuts 
     */
	double[] generateCuts(int attribute, int number_of_intervals,
			DoubleDataTable table) {
		decisionDistribution = new TreeMap<Double, Record>();
		Iterator<DoubleData> iter = table.getDataObjects().iterator();
		while (iter.hasNext()) {
			DoubleData dd = iter.next();
			Double tmp = Double.valueOf(dd.get(attribute));
			Double dec = Double.valueOf(dd.get(
					table.attributes().decision()));
			if (decisionDistribution.containsKey(tmp)) {
				Record r = decisionDistribution.get(tmp);
				r.setSum(r.getSum() + 1);
				if (r.getDecisions().containsKey(dec)) {
					r.getDecisions().put(dec, r.getDecisions().get(dec) + 1);
				} else {
					r.getDecisions().put(dec, 1);
				}
			} else {
				Record r = new Record();
				r.setSum(1);
				r.setDecisions(new HashMap<Double, Integer>());
				r.getDecisions().put(dec, 1);
				decisionDistribution.put(tmp, r);
			}
		}

		double[] cuts = discretize(decisionDistribution.firstKey(), decisionDistribution
				.lastKey());
		if(cuts==null)
			cuts = new double[0];
		
//		for (int i = 0; i < cuts.length; i++) {
//			System.out.println("attribute: " + attribute + " cut: " + cuts[i]);
//		}
		return cuts;
	}

    /**
     * Method that generate cuts from given range;
     * 
     * @param start 	start of range.
     * @param end 		end of range.
     * @return array 	generated cuts.
     */
	private double[] discretize(Double start, Double end) {
		if ((start == null) || (end == null))
			return null;
		if (start >= end)
			return null;
		// przegladamy tabele z danymi i wyciagamy wszystkie wartosci
		// etykiety(decyzji)
		ArrayList<Double> decisions = new ArrayList<Double>();
		double dataSize = 0;
		for (Double tmp = start; (tmp!=null)&&(tmp <= end); tmp = decisionDistribution.higherKey(tmp)) {
			for (Double d : decisionDistribution.get(tmp).getDecisions().keySet()) {
				if (!decisions.contains(d)) {
					decisions.add(d);
				}
			}
			dataSize += decisionDistribution.get(tmp).getSum();
		}
		double min = decisionDistribution.firstKey();
		double min_val = Double.MAX_VALUE;
		double min_e_lw = 0;
		double min_e_gt = 0;
		int[][] min_tab = new int[decisions.size()][2];

		// zerujemy tablice
		int[][] tab = new int[decisions.size()][2];
		for (int j = 0; j < decisions.size(); j++) {
			tab[j][0] = 0;
			tab[j][1] = 0;
		}
		double lw = 0;
		double gt = dataSize;
		for (Double tmp = start; (tmp!=null)&&(tmp <= end); tmp = decisionDistribution.higherKey(tmp)) {
			for (Double d : decisionDistribution.get(tmp).getDecisions().keySet()) {
				tab[decisions.indexOf(d)][1] += decisionDistribution.get(tmp)
						.getDecisions().get(d);
			}
		}

		// przegladamy wszystkie rekordy i wybieramy ten z najmniejsza wartoscia
		// entropii
		for (Double tmp = start; tmp < end; tmp = decisionDistribution.higherKey(tmp)) {
			lw += decisionDistribution.get(tmp).getSum();
			gt -= decisionDistribution.get(tmp).getSum();

			for (Double d : decisionDistribution.get(tmp).getDecisions().keySet()) {
				tab[decisions.indexOf(d)][0] += decisionDistribution.get(tmp)
						.getDecisions().get(d);
				tab[decisions.indexOf(d)][1] -= decisionDistribution.get(tmp)
						.getDecisions().get(d);
			}

			// liczymy entropi� dla danego ciecia
			double e_lw = 0;
			double e_gt = 0;
			for (int j = 0; j < decisions.size(); j++) {
				if (lw > 0) {
					double v1 = ((double) tab[j][0]) / (double) lw;
					if (v1 != 0)
						e_lw -= v1 * log2(v1);

				}
				if (gt > 0) {
					double v2 = (((double) tab[j][1]) / (double) gt);
					if (v2 != 0)
						e_gt -= v2 * log2(v2);

				}
			}


			double val = (((double) lw / dataSize) * e_lw)
					+ (((double) gt / dataSize) * e_gt);
			// uaktualniamy minimum
			if (val < min_val) {
				min_val = val; // minimalna entropia
				min = tmp; // minimum val
				min_e_lw = e_lw;
				min_e_gt = e_gt;
				min_tab = tab.clone();
			}
		}

		double Dp0 = 0;
		double Dp1 = 0;
		double Dp = decisions.size();
		double ent = 0;
		for (int i = 0; i < Dp; i++) {
			double v = ((double) (min_tab[i][0] + min_tab[i][1])) / dataSize;
			if (v > 0)
				ent -= v * log2(v);
			if (min_tab[i][0] > 0)
				Dp0++;
			if (min_tab[i][1] > 0)
				Dp1++;
		}

		double infoGain = ent - min_val;

		double delta = (log2(Math.pow(3.0, Dp) - 2.0))
				- ((Dp * ent) - (Dp0 * min_e_lw) - (Dp1 * min_e_gt));

		double weight = (log2(dataSize - 1.0) + delta) / dataSize;		
		
		if (infoGain < weight)
			return null;

		double[] ret1 = discretize(start, min);
		double[] ret2 = discretize(decisionDistribution.higherKey(min), end);

		return concatenate(ret1, ((min + decisionDistribution.higherKey(min)) / 2.0),
				ret2);

	}
    /**
     * Method that concatenate two arrays and one value.
     * 
     * @param t1 		first array to concatenate.
     * @param v 		value to put between t1 and t2.
     * @param t2 		second array to concatenate.
     * @return array 	concatenated array.
     */
	private double[] concatenate(double[] t1, double v, double[] t2) {
		int t1_length = 0;
		int t2_length = 0;
		if (t1 != null)
			t1_length = t1.length;
		if (t2 != null)
			t2_length = t2.length;

		double ret[] = new double[t1_length + t2_length + 1];
		if (t1 != null)
			System.arraycopy(t1, 0, ret, 0, t1_length);

		ret[t1_length] = v;
		if (t2 != null)
			System.arraycopy(t2, 0, ret, t1_length + 1, t2_length);

		return ret;
	}
    /**
     * Method that compute logarithm (base 2) of given param.
     */
	public static double log2(double d) {
		return Math.log(d) / Math.log(2.0);
	}

	private class Record {
		private int sum = 0;
		private HashMap<Double, Integer> decisions = new HashMap<Double, Integer>();

		public int getSum() {
			return sum;
		}

		public void setSum(int sum) {
			this.sum = sum;
		}

		public HashMap<Double, Integer> getDecisions() {
			return decisions;
		}

		public void setDecisions(HashMap<Double, Integer> decisions) {
			this.decisions = decisions;
		}

	}
}

Copyright © 2008-2011 by TunedIT
Design by luksite