/*
* Debellor
*
* Copyright (C) 2008-2009 by Marcin Wojnarski
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package org.debellor.weka;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import org.debellor.core.DataObject;
import org.debellor.core.DataType;
import org.debellor.core.Sample;
import org.debellor.core.Sample.SampleType;
import org.debellor.core.data.DataVector;
import org.debellor.core.data.NumericFeature;
import org.debellor.core.data.SymbolicFeature;
import org.debellor.core.data.DataVector.DataVectorType;
import org.debellor.core.data.NumericFeature.NumericFeatureType;
import org.debellor.core.data.SymbolicFeature.SymbolicFeatureType;
import org.debellor.core.exception.data.DataException;
import weka.core.Instance;
import weka.core.Instances;
/**
* Conversion of data structures between the Weka and Debellor representations.
*
* @author Marcin Wojnarski
*
*/
public class DataConverter {

    /**
     * Builds the Debellor type description of samples stored in a Weka dataset.
     * Numeric attributes become {@code NumericFeatureType}, nominal attributes
     * become {@code SymbolicFeatureType} holding the attribute's values in Weka
     * order. The attribute located at the class index (when the index lies
     * within the attribute range) becomes the decision type; all remaining
     * attributes form the data vector type, in their original order.
     *
     * @param instances Data in Weka format. Class index must be set
     *        if non-null {@code Sample.decision} has to be generated.
     * @return sample type in Debellor format; its decision type is
     *         {@code DataType.NOT_PRESENT} when no class attribute was found
     * @throws WekaConversionException if an attribute is neither numeric nor nominal
     */
    public static SampleType sampleTypeFrom(Instances instances)
            throws WekaConversionException
    {
        int nattr = instances.numAttributes();
        int decisionIndex = instances.classIndex();

        // the class attribute, if present, is kept out of Sample.data
        boolean hasDecision = (decisionIndex >= 0) && (decisionIndex < nattr);
        DataType[] dataType = new DataType[hasDecision ? nattr - 1 : nattr];
        DataType decisionType = DataType.NOT_PRESENT;

        int iData = 0;
        for (int i = 0; i < nattr; i++) {
            DataType attrType = featureTypeFrom(instances.attribute(i), i);
            if (i == decisionIndex) {
                decisionType = attrType;
            } else {
                dataType[iData++] = attrType;
            }
        }
        return new SampleType(new DataVectorType(dataType), decisionType);
    }

    /**
     * Maps a single Weka attribute to the corresponding Debellor feature type.
     *
     * @param wekaAttr attribute to convert
     * @param index position of the attribute in its dataset, reported on failure
     * @return numeric or symbolic feature type matching the attribute
     * @throws WekaConversionException if the attribute is neither numeric nor nominal
     */
    private static DataType featureTypeFrom(weka.core.Attribute wekaAttr, int index)
            throws WekaConversionException
    {
        switch (wekaAttr.type()) {
            case weka.core.Attribute.NUMERIC:
                return new NumericFeatureType();

            case weka.core.Attribute.NOMINAL: {
                // copy nominal values preserving Weka's ordering
                String[] values = new String[wekaAttr.numValues()];
                for (int j = 0; j < values.length; j++) {
                    values[j] = wekaAttr.value(j);
                }
                return new SymbolicFeatureType(values);
            }

            default:
                throw new WekaConversionException(wekaAttr.type(), index);
        }
    }

    /**
     * Converts a single Weka instance into a Debellor sample. The attribute at
     * the instance's class index becomes {@code Sample.decision}; every other
     * attribute is placed, in order, in the sample's data vector. Missing Weka
     * values are represented by {@code null} entries.
     *
     * @param instance Weka instance to convert; its weight must equal 1
     * @param sampleType Debellor type of the sample to create; its data part
     *        must be a {@code DataVectorType}
     * @return new {@code Sample}, converted from Weka {@code instance}
     * @throws WekaConversionException if the instance weight differs from 1, or
     *         a non-missing value has a feature type other than numeric or symbolic
     * @throws DataException declared for the Debellor data objects created here
     *         (the exact conditions are defined by those classes)
     */
    public static Sample sampleFrom(Instance instance, SampleType sampleType)
            throws WekaConversionException, DataException
    {
        if (instance.weight() != 1) {
            throw new WekaConversionException(
                "Weka to Debellor conversion error. Weights of samples are not supported.");
        }

        int nattr = instance.numAttributes();
        int decisionIndex = instance.classIndex();
        DataVectorType type = (DataVectorType) sampleType.data;
        DataObject[] attrs = new DataObject[type.size()];
        DataObject decision = null;

        int iData = 0;
        for (int i = 0; i < nattr; i++) {
            boolean isDecision = (i == decisionIndex);

            // the type is looked up before the missing-value check, as the lookup may itself fail
            DataType attrType;
            if (isDecision) {
                attrType = sampleType.decision;
            } else {
                attrType = type.get(iData);
            }

            DataObject attr = null;     // stays null for a missing value
            if (!instance.isMissing(i)) {
                double value = instance.value(i);
                if (attrType.dataClass == NumericFeature.class) {
                    attr = new NumericFeature(value);
                }
                else if (attrType.dataClass == SymbolicFeature.class) {
                    // Weka keeps a nominal value as the index of the value, stored in a double
                    attr = new SymbolicFeature((int) value, attrType);
                }
                else {
                    throw new WekaConversionException("Weka to Debellor conversion error. " +
                        "Feature type " + attrType.dataClass.getName() + " is not supported. " +
                        "Index of the feature: " + i + ".");
                }
            }

            if (isDecision) {
                decision = attr;
            } else {
                attrs[iData++] = attr;
            }
        }
        return new Sample(new DataVector(attrs), decision);
    }

    /**
     * Creates a Weka dataset header matching a Debellor sample type. The header
     * is generated as ARFF text and parsed by the {@code Instances} constructor;
     * the generated text contains no data rows. Input attributes are named
     * {@code a1}, {@code a2}, ...; the decision attribute, if present, is named
     * {@code decision}, placed last and set as the class attribute.
     *
     * @param sampleType Debellor type of samples; its data part must be
     *        a {@code DataVectorType}
     * @return Weka dataset with attributes corresponding to {@code sampleType}
     * @throws WekaConversionException if a feature type is neither numeric nor
     *         symbolic, or if any other failure occurs while generating or
     *         parsing the header (the original exception is then wrapped)
     */
    public static Instances instancesFrom(SampleType sampleType)
            throws WekaConversionException
    {
        // write Weka header to a string, then pass it to Instances constructor
        try {
            Writer header = new StringWriter(1024);
            header.write("@relation relationName\n");

            // write description of input attributes
            DataVectorType data = (DataVectorType) sampleType.data;
            for (int i = 0; i < data.size(); i++) {
                writeAttrHeader(header, "a" + (i + 1), data.get(i));
            }

            // write description of decision attribute
            boolean hasDecision = sampleType.hasDecision();
            if (hasDecision) {
                writeAttrHeader(header, "decision", sampleType.decision);
            }
            header.write("@data\n");

            Reader reader = new StringReader(header.toString());
            Instances instances = new Instances(reader, 1);
            if (hasDecision) {
                instances.setClassIndex(instances.numAttributes() - 1);
            }
            return instances;
        }
        catch (WekaConversionException e) {
            // already carries a precise message; do not wrap it a second time
            throw e;
        }
        catch (Exception e) {
            throw new WekaConversionException(e);
        }
    }

    /**
     * Writes a single ARFF {@code @attribute} line describing a Debellor feature type.
     * Numeric features are declared as {@code real}; symbolic features as a
     * brace-enclosed list of values produced by the type's {@code toString(",", "'")}.
     *
     * @param header destination of the ARFF header text
     * @param name name of the attribute to declare
     * @param attr Debellor type of the attribute
     * @throws IOException if writing to {@code header} fails
     * @throws WekaConversionException if {@code attr} is neither numeric nor symbolic
     */
    private static void writeAttrHeader(Writer header, String name, DataType attr)
            throws IOException, WekaConversionException
    {
        header.write("@attribute " + name + " ");
        if (attr.dataClass == NumericFeature.class) {
            header.write("real");
        }
        else if (attr.dataClass == SymbolicFeature.class) {
            header.write("{");
            header.write(((SymbolicFeatureType) attr).toString(",", "'"));
            header.write("}");
        }
        else {
            // fail explicitly rather than emit a declaration without a type,
            // which would only surface later as an obscure ARFF parsing error
            String typeName = (attr.dataClass == null) ? "null" : attr.dataClass.getName();
            throw new WekaConversionException("Debellor to Weka conversion error. " +
                "Feature type " + typeName + " is not supported. " +
                "Name of the attribute: " + name + ".");
        }
        header.write("\n");
    }

    /**
     * Converts a Debellor sample into a Weka instance of weight 1.0. Values of
     * input features come first, in vector order; when {@code sampleType} has
     * a decision, the decision value is appended as the last attribute.
     * Missing input features and a {@code null} decision are encoded as
     * Weka missing values. This method does not attach the instance to any dataset.
     *
     * @param sample sample to convert; its data must be a data vector
     * @param sampleType type of {@code sample}; its data part must be a data vector type
     * @return new Weka instance holding the values of {@code sample}
     * @throws DataException declared for the Debellor accessors used during
     *         conversion (the exact conditions are defined by those classes)
     */
    public static Instance instanceFrom(Sample sample, SampleType sampleType)
            throws DataException
    {
        DataVector vector = sample.data.asDataVector();
        DataVectorType vectorsType = sampleType.data.asDataVectorType();

        int ninputs = vector.size();
        boolean hasDecision = sampleType.hasDecision();
        double[] values = new double[hasDecision ? ninputs + 1 : ninputs];

        // insert input attributes
        for (int i = 0; i < ninputs; i++) {
            if (vector.isMissing(i)) {
                values[i] = Instance.missingValue();
            } else {
                values[i] = valueFromFeature(vector.get(i), vectorsType.get(i));
            }
        }

        // insert decision attribute at the end
        if (hasDecision) {
            if (sample.decision == null) {
                values[ninputs] = Instance.missingValue();
            } else {
                values[ninputs] = valueFromFeature(sample.decision, sampleType.decision);
            }
        }

        return new Instance(1.0, values);
    }

    /**
     * Returns the number under which Weka stores a given Debellor feature.
     * A numeric feature yields its value; any other object is treated as
     * a symbolic feature and yields the code of its value in type {@code t}.
     *
     * @param d feature to convert
     * @param t type of the feature; consulted only for non-numeric features
     * @return value to be stored in a Weka instance
     * @throws DataException declared for the Debellor accessors used here
     */
    private static double valueFromFeature(DataObject d, DataType t) throws DataException {
        if (d instanceof NumericFeature) {
            return d.asNumericFeature().value;
        }
        SymbolicFeatureType st = t.asSymbolicFeatureType();
        return st.codeOf(d.asSymbolicFeature().value);
    }
}