/*
* Debellor
*
* Copyright (C) 2008-2009 by Marcin Wojnarski
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package org.debellor.core;
import org.debellor.core.Cell.Stream;
import org.debellor.core.data.DataVector;
import org.debellor.core.data.NumericFeature;
import org.debellor.core.data.SymbolicFeature;
/**
* Sample of data, also known as an instance/object/vector,
* the basic unit of data transfer between cells (see {@link Stream#next()}).
* Sample is composed of input {@link #data} and an associated
* {@link #decision} (output data).
* Samples are constant (<i>immutable</i>),
* like {@link String} objects, so you may freely share them
* without risk of accidental modification.
*
* <p>In contrast to some other data mining systems, e.g. Weka,
* Debellor's samples may contain various types of data and decisions,
* not necessarily vectors.
* The {@code data} and {@code decision} fields are declared as references to the base
* {@link DataObject} class, so it is possible to add new data types
* by defining new subclasses of {@code DataObject}.
* When the cell receives a sample, it usually has to manually downcast
* the contained {@code DataObject}s to the specific subclasses expected by this cell,
* in order to process the sample.
*
* <p>It is up to the cell which fields (<code>data</code>, <code>decision</code>)
* of the sample it actually uses.
* The cell may choose to read and/or write both,
* only one or none of them - this depends on the type of the cell
* (is it a decision system? preprocessing algorithm? etc.),
* its parameters (e.g., a cell could
* take a parameter which controls whether the processing is applied to
* <code>data</code> or <code>decision</code>)
* and whether the sample is presented at the input
* or generated at the output of the cell.
* Every cell should define a <i>contract</i> which specifies
* what type of samples is expected at the input
* and what type of samples is generated at the output.
*
* <p>If the cell wants to know in advance what type of samples will be generated
* by <code>Stream.next()</code> of input stream, it may read the {@link SampleType}
* from {@link Stream#sampleType} field - its value
* is available immediately after the stream is opened,
* so the cell may prepare internal structures as necessary for a given data type,
* e.g., arrays of appropriate length if the data will be composed of vectors.
*
* <p>On the other hand, before the cell starts generating output samples,
* it should create a sampleType object describing the samples to be produced
* as precisely as possible. This object should be returned from
* overridden {@link Cell#onOpen()}.
* Providing a meaningful (non-null) sampleType object is <i>not</i> obligatory,
* but otherwise the usability of the cell is low,
* because most cells that could be connected to the given cell as consumers
* would fail at runtime due to an unhandled type of input data.
*
* <p>Algorithms from Weka and Rseslib libraries
* operate on samples whose {@code data} field is a {@link DataVector}
* composed of {@link NumericFeature} or {@link SymbolicFeature} objects,
* while the {@code decision} is a single feature object.
*
* @see Stream#next()
* @see Cell#onNext()
*
* @author Marcin Wojnarski
*
*/
public final class Sample extends DataObject {

    /**
     * Describes common properties of all {@link Sample} objects in a given
     * data {@link Stream}.
     * {@code SampleType} objects are immutable.
     *
     * @see Cell#open()
     * @see Cell#onOpen()
     */
    public static final class SampleType extends DataType {

        /**
         * Type of {@link Sample#data} field.
         * Can take on special values defined in DataType, like
         * {@link DataType#UNKNOWN} or {@link DataType#NOT_PRESENT}.
         */
        public final DataType data;

        /**
         * Type of {@link Sample#decision} field.
         * Can take on special values defined in DataType, like
         * {@link DataType#UNKNOWN} or {@link DataType#NOT_PRESENT}.
         * <code>NOT_PRESENT</code> means that samples do not have decisions,
         * they are composed of 'data' part alone.
         */
        public final DataType decision;

        /**
         * Creates a descriptor of samples with the given types of fields.
         *
         * @param data type of the {@link Sample#data} field
         * @param decision type of the {@link Sample#decision} field
         */
        public SampleType(DataType data, DataType decision) {
            super(Sample.class);
            this.data = data;
            this.decision = decision;
        }

        /**
         * Returns a new {@code SampleType} with the given data type
         * and the decision type copied from this object.
         * This object is left unchanged.
         *
         * @param data type of the data field in the returned object
         * @return a newly created {@code SampleType}
         */
        public SampleType setData(DataType data) {
            return new SampleType(data, decision);
        }

        /**
         * Returns a new {@code SampleType} with the given decision type
         * and the data type copied from this object.
         * This object is left unchanged.
         *
         * @param decision type of the decision field in the returned object
         * @return a newly created {@code SampleType}
         */
        public SampleType setDecision(DataType decision) {
            return new SampleType(data, decision);
        }

        /**
         * Returns true if samples of the data set contain decisions and their type is known (defined).
         * Note that still some decisions (even all of them) may have <code>null</code> values.
         *
         * @return {@code false} if the decision type is {@link DataType#NOT_PRESENT}
         *         or {@link DataType#UNKNOWN} (compared by reference), {@code true} otherwise
         */
        public boolean hasDecision() {
            return (decision != DataType.NOT_PRESENT) && (decision != DataType.UNKNOWN);
        }

        /** Returns the data type and the decision type separated by {@code " => "}. */
        @Override
        public String toString() {
            return data + " => " + decision;
        }
    }

    /**
     * Input data on which data processing algorithms will primarily work.
     * Can be {@code null} for some or all samples in a data set.
     * May have an associated {@link #decision}.
     */
    public final DataObject data;

    /**
     * Decision (also known as target/prediction/output value)
     * associated with the {@link #data}.
     * Either assigned by a supervisor (ground truth / target)
     * OR predicted by a decision system (prediction / output value).
     * Can be {@code null} for some or all samples in a data set.
     */
    public final DataObject decision;

    /**
     * Creates a sample composed of the given data and decision.
     *
     * @param data input data of the sample, may be {@code null}
     * @param decision decision associated with the data, may be {@code null}
     */
    public Sample(DataObject data, DataObject decision) {
        this.data = data;
        this.decision = decision;
    }

    /**
     * Returns a new sample with the given data and the decision copied
     * from this sample. This sample is left unchanged.
     *
     * @param data data of the returned sample, may be {@code null}
     * @return a newly created sample
     */
    public Sample setData(DataObject data) {
        return new Sample(data, decision);
    }

    /**
     * Returns a new sample with the given decision and the data copied
     * from this sample. This sample is left unchanged.
     *
     * @param decision decision of the returned sample, may be {@code null}
     * @return a newly created sample
     */
    public Sample setDecision(DataObject decision) {
        return new Sample(data, decision);
    }

    /** Returns the data and the decision separated by {@code " => "}. */
    @Override
    public String toString() {
        return data + " => " + decision;
    }

    /**
     * Two samples are equal if both their {@link #data} and their
     * {@link #decision} fields are equal, where two {@code null}s
     * are considered equal to each other.
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        // instanceof yields false for null, so no separate null check is needed
        if (!(obj instanceof Sample)) {
            return false;
        }
        Sample other = (Sample) obj;
        return sameOrEqual(data, other.data) && sameOrEqual(decision, other.decision);
    }

    /**
     * Hash code consistent with {@link #equals(Object)}; a {@code null}
     * field contributes 0.
     */
    @Override
    public int hashCode() {
        int hData = (data == null) ? 0 : data.hashCode();
        int hDecision = (decision == null) ? 0 : decision.hashCode();
        // Ordered combination instead of XOR: XOR is symmetric, so swapping
        // data and decision would always collide, and it yields 0 whenever
        // both fields have the same hash.
        return 31 * hData + hDecision;
    }

    /** Null-safe equality: true if both references are identical or {@code a.equals(b)}. */
    private static boolean sameOrEqual(DataObject a, DataObject b) {
        return (a == b) || ((a != null) && a.equals(b));
    }
}