Repository /Debellor/debellor-1.0.jar:org.debellor.core.Sample


Back

No file description

Source code

/*
 *  Debellor
 *
 *  Copyright (C) 2008-2009 by Marcin Wojnarski
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package org.debellor.core;

import org.debellor.core.Cell.Stream;
import org.debellor.core.data.DataVector;
import org.debellor.core.data.NumericFeature;
import org.debellor.core.data.SymbolicFeature;



/**
 * Sample of data, also known as an instance/object/vector,
 * the basic unit of data transfer between cells (see {@link Stream#next()}).
 * Sample is composed of input {@link #data} and an associated 
 * {@link #decision} (output data).
 * Samples are constant (<i>immutable</i>),
 * like {@link String} objects, so you may freely share them
 * without risk of accidental modification.
 * 
 * <p>In contrast to some other data mining systems, e.g. Weka,
 * Debellor's samples may contain various types of data and decisions,
 * not necessarily vectors.
 * The {@code data} and {@code decision} fields are declared as references to the base
 * {@link DataObject} class, so it is possible to add new data types
 * by defining new subclasses of {@code DataObject}. 
 * When a cell receives a sample, it usually has to manually downcast 
 * the contained {@code DataObject} instances to the specific subclasses it expects,
 * in order to process the sample.
 * 
 * <p>It is up to the cell which fields (<code>data</code>, <code>decision</code>) 
 * of the sample it actually uses.
 * The cell may choose to read and/or write both,
 * only one or none of them - this depends on the type of the cell
 * (is it a decision system? preprocessing algorithm? etc.),
 * its parameters (e.g., a cell could
 * take a parameter which controls whether the processing is applied to 
 * <code>data</code> or <code>decision</code>) 
 * and whether the sample is presented at the input
 * or generated at the output of the cell.
 * Every cell should define a <i>contract</i> which specifies
 * what type of samples is expected at the input 
 * and what type of samples is generated at the output. 
 * 
 * <p>If the cell wants to know in advance what type of samples will be generated
 * by <code>Stream.next()</code> of input stream, it may read the {@link SampleType} 
 * from {@link Stream#sampleType} field - its value
 * is available immediately after the stream is opened,
 * so the cell may prepare internal structures as necessary for a given data type,
 * e.g., arrays of appropriate length if the data will be composed of vectors.
 * 
 * <p>On the other hand, before the cell starts generating output samples,
 * it should create a {@code SampleType} object describing the samples to be produced
 * as precisely as possible. This object should be returned from 
 * the overridden {@link Cell#onOpen()}.
 * Providing a meaningful (non-null) {@code SampleType} object is <i>not</i> obligatory,
 * but otherwise the usability of the cell is low,
 * because most cells that could be connected to the given cell as consumers
 * would fail at runtime due to an unhandled type of input data. 
 * 
 * <p>Algorithms from Weka and Rseslib libraries
 * operate on samples whose {@code data} field is a {@link DataVector}
 * composed of {@link NumericFeature} or {@link SymbolicFeature} objects,
 * while the {@code decision} is a single feature object.
 * 
 * @see Stream#next()
 * @see Cell#onNext()
 * 
 * @author Marcin Wojnarski
 *
 */
public final class Sample extends DataObject {

	/** 
	 * Describes common properties of all {@link Sample} objects in a given
	 * data {@link Stream}.
	 * {@code SampleType} objects are immutable.
	 * @see Cell#open()
	 * @see Cell#onOpen()
	 */
	public static final class SampleType extends DataType
	{
		/** Type of {@link Sample#data} field.
		 * Can take on special values defined in DataType, like
		 * {@link DataType#UNKNOWN} or {@link DataType#NOT_PRESENT}. */
		public final DataType data;

		/** Type of {@link Sample#decision} field. 
		 * Can take on special values defined in DataType, like
		 * {@link DataType#UNKNOWN} or {@link DataType#NOT_PRESENT}.
		 * <code>NOT_PRESENT</code> means that samples do not have decisions,
		 * they are composed of 'data' part alone. */
		public final DataType decision;

		/**
		 * Creates a descriptor of samples whose parts have the given types.
		 * {@code Sample.class} is passed to the {@link DataType} constructor.
		 * 
		 * @param data type of the {@link Sample#data} field
		 * @param decision type of the {@link Sample#decision} field
		 */
		public SampleType(DataType data, DataType decision) {
			super(Sample.class);
			this.data = data;
			this.decision = decision;
		}

		/**
		 * Returns a new {@code SampleType} with the given data type
		 * and the decision type of this object. This object is not modified.
		 * 
		 * @param data type of the {@link Sample#data} field in the new descriptor
		 * @return newly created descriptor
		 */
		public SampleType setData(DataType data) {
			return new SampleType(data, this.decision);
		}

		/**
		 * Returns a new {@code SampleType} with the given decision type
		 * and the data type of this object. This object is not modified.
		 * 
		 * @param decision type of the {@link Sample#decision} field in the new descriptor
		 * @return newly created descriptor
		 */
		public SampleType setDecision(DataType decision) {
			return new SampleType(this.data, decision);
		}

		/** Returns true if samples of the data set contain decisions and their type is known (defined). 
		 * Note that still some decisions (even all of them) may have <code>null</code> values. */
		public boolean hasDecision() {
			// The special values are compared by identity, not by equals().
			return (decision != DataType.NOT_PRESENT) && (decision != DataType.UNKNOWN);
		}

		/** Returns a text of the form <code>"data-type => decision-type"</code>. */
		@Override
		public String toString() {
			return data + " => " + decision;
		}
	}

	
	/** Input data on which data processing algorithms will primarily work.
	 * Can be {@code null} for some or all samples in a data set.
	 * May have an associated {@link #decision}. */
	public final DataObject data;
	
	/** Decision (also known as target/prediction/output value) 
	 * associated with the {@link #data}.
	 * Either assigned by a supervisor (ground truth / target) 
	 * OR predicted by a decision system (prediction / output value). 
	 * Can be {@code null} for some or all samples in a data set. */
	public final DataObject decision;
	

	/**
	 * Creates a sample composed of the given parts. The references are stored as given.
	 * 
	 * @param data input data of the sample, may be {@code null}
	 * @param decision decision associated with the data, may be {@code null}
	 */
	public Sample(DataObject data, DataObject decision) {
		this.data = data;
		this.decision = decision;
	}

	/**
	 * Returns a new sample with the given data and the decision of this sample.
	 * This sample is not modified.
	 * 
	 * @param data input data of the new sample, may be {@code null}
	 * @return newly created sample
	 */
	public Sample setData(DataObject data) {
		return new Sample(data, this.decision);
	}

	/**
	 * Returns a new sample with the data of this sample and the given decision.
	 * This sample is not modified.
	 * 
	 * @param decision decision of the new sample, may be {@code null}
	 * @return newly created sample
	 */
	public Sample setDecision(DataObject decision) {
		return new Sample(this.data, decision);
	}
	
	/** Returns a text of the form <code>"data => decision"</code>. */
	@Override
	public String toString() {
		return data + " => " + decision;
	}

	/**
	 * Two samples are equal if both their {@link #data} and their {@link #decision}
	 * are pairwise equal; a {@code null} part is equal only to another {@code null}.
	 */
	@Override
	public boolean equals(Object obj) {
		if (this == obj) {
			return true;
		}
		// instanceof is false for null, so no separate null check is needed.
		if (!(obj instanceof Sample)) {
			return false;
		}
		Sample other = (Sample) obj;
		return sameOrEqual(data, other.data) && sameOrEqual(decision, other.decision);
	}

	/**
	 * Hash code consistent with {@link #equals(Object)}.
	 * The parts are combined in an order-sensitive way, so that swapping
	 * {@code data} and {@code decision} generally yields a different hash, and a sample
	 * whose two parts are equal does not always hash to 0 (as a plain XOR would give).
	 */
	@Override
	public int hashCode() {
		int dataHash = (data == null) ? 0 : data.hashCode();
		int decisionHash = (decision == null) ? 0 : decision.hashCode();
		return 31 * dataHash + decisionHash;
	}

	/** Null-safe equality: true if both references are identical (including both
	 * {@code null}) or the first one is non-null and reports equality with the second. */
	private static boolean sameOrEqual(DataObject first, DataObject second) {
		return (first == second) || ((first != null) && first.equals(second));
	}

}

Copyright © 2008-2011 by TunedIT
Design by luksite