Repository /Debellor/debellor-1.0.jar:org.debellor.weka.DataConverter


Back

No file description

Source code

/*
 *  Debellor
 *
 *  Copyright (C) 2008-2009 by Marcin Wojnarski
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package org.debellor.weka;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;

import org.debellor.core.DataObject;
import org.debellor.core.DataType;
import org.debellor.core.Sample;
import org.debellor.core.Sample.SampleType;
import org.debellor.core.data.DataVector;
import org.debellor.core.data.NumericFeature;
import org.debellor.core.data.SymbolicFeature;
import org.debellor.core.data.DataVector.DataVectorType;
import org.debellor.core.data.NumericFeature.NumericFeatureType;
import org.debellor.core.data.SymbolicFeature.SymbolicFeatureType;
import org.debellor.core.exception.data.DataException;

import weka.core.Instance;
import weka.core.Instances;

/**
 * Conversion of data structures between Weka and Debellor representation.
 * 
 * @author Marcin Wojnarski
 *
 */
public class DataConverter {

	/**
	 * @param instances  Data in Weka format. Class index must be set 
	 *        if non-null {@code Sample.decision} has to be generated.  
	 * @return sample type in Debellor format
	 */
	public static SampleType sampleTypeFrom(Instances instances) 
		throws WekaConversionException 
	{
		int nattr = instances.numAttributes();
		int decisionIndex = instances.classIndex();
		
		int nattrData = nattr;	// no. of features in Sample.data
		if((decisionIndex >= 0) && (decisionIndex < nattr))
			nattrData--;

		DataType[] dataType = new DataType[nattrData];
		DataType decisionType = DataType.NOT_PRESENT;
		
		for(int i = 0, iData = 0; i < nattr; i++) 
		{
			DataType attrType = null;

			weka.core.Attribute wekaAttr = instances.attribute(i);
			switch(wekaAttr.type()) {
				case weka.core.Attribute.NUMERIC:
					attrType = new NumericFeatureType();
					break;
				case weka.core.Attribute.NOMINAL:
					int nval = wekaAttr.numValues();
					String[] values = new String[nval];
					for(int j = 0; j < nval; j++)
						values[j] = wekaAttr.value(j);
					attrType = new SymbolicFeatureType(values);
					break;
				default:
					throw new WekaConversionException(wekaAttr.type(), i);
			}
			
			if(i == decisionIndex)
				decisionType = attrType;
			else
				dataType[iData++] = attrType;
		}

		return new SampleType(new DataVectorType(dataType), decisionType);
	}

	/**
	 * @param instance
	 * @param sampleType
	 * @return new {@code Sample}, converted from Weka {@code instance}
	 * @throws DataException 
	 */
	public static Sample sampleFrom(Instance instance, SampleType sampleType)
		throws WekaConversionException, DataException 
	{
		if(instance.weight() != 1) 
			throw new WekaConversionException(
					"Weka to Debellor conversion error. Weights of samples are not supported.");
	
		int nattr = instance.numAttributes();
		int decisionIndex = instance.classIndex();

		DataVectorType type = (DataVectorType) sampleType.data;
		int nattrData = type.size();
		
		DataObject[] attrs = new DataObject[nattrData];
		DataObject decision = null;
		
		for(int i = 0, iData = 0; i < nattr; i++) 
		{
			DataType attrType = (DataType) 
				((i == decisionIndex) ? sampleType.decision : type.get(iData));
			DataObject attr = null;

			if(!instance.isMissing(i)) {
				double value = instance.value(i);

				if(attrType.dataClass == NumericFeature.class) {
					attr = new NumericFeature(value);
				}
				else if(attrType.dataClass == SymbolicFeature.class) {
					attr = new SymbolicFeature((int)value, attrType);
				}
				else throw new WekaConversionException("Weka to Debellor conversion error. " +
						"Feature type " + attrType.dataClass.getName() + " is not supported. " +
						"Index of the feature: " + i + ".");
			}

			if(i == decisionIndex)
				decision = attr;
			else
				attrs[iData++] = attr;
		}

		return new Sample(new DataVector(attrs), decision);
	}

	public static Instances instancesFrom(SampleType sampleType) 
		throws WekaConversionException 
	{
		// write Weka header to a string, then pass it to Instances constructor
		try {
			Writer header = new StringWriter(1024);
			header.write("@relation relationName\n");
			
			// write description of input attributes
			DataVectorType data = (DataVectorType) sampleType.data;
			for(int i = 0; i < data.size(); i++)
				writeAttrHeader(header, "a" + (i+1), data.get(i));
			
			// write description of decision attribute
			if(sampleType.hasDecision())
				writeAttrHeader(header, "decision", sampleType.decision);			
			
			header.write("@data\n");
			Reader reader = new StringReader(header.toString());
			Instances instances = new Instances(reader, 1);
			if(sampleType.hasDecision())
				instances.setClassIndex(instances.numAttributes() - 1);
			
			return instances;
		}
		catch(Exception e) {
			throw new WekaConversionException(e);
		}
	}

	private static void writeAttrHeader(Writer header, String name, DataType attr)
		throws IOException 
	{
		header.write("@attribute " + name + " ");
		if(attr.dataClass == NumericFeature.class)
			header.write("real");
		else if(attr.dataClass == SymbolicFeature.class) {
			header.write("{");
			header.write(((SymbolicFeatureType) attr).toString(",", "'"));
			header.write("}");
		}
		header.write("\n");
	}

	public static Instance instanceFrom(Sample sample, SampleType sampleType)
		throws DataException 
	{
		DataVector vector = sample.data.asDataVector();
		DataVectorType vectorsType = sampleType.data.asDataVectorType();
		int wekaSize = vector.size();  
		if(sampleType.hasDecision())
			wekaSize++;
		double[] values = new double[wekaSize];
		
		// insert input attributes
		int i = 0;
		for(; i < vector.size(); i++)
			if(vector.isMissing(i))
				values[i] = Instance.missingValue();
			else
				values[i] = valueFromFeature(vector.get(i), vectorsType.get(i));
		
		// insert decision attribute at the end
		if(i < wekaSize)
			if(sample.decision == null)
				values[i] = Instance.missingValue();
			else
				values[i] = valueFromFeature(sample.decision, sampleType.decision);
		
		Instance instance = new Instance(1.0, values);
		return instance;
	}

	private static double valueFromFeature(DataObject d, DataType t) throws DataException {
		if(d instanceof NumericFeature)
			return d.asNumericFeature().value;
		else {
			SymbolicFeatureType st = t.asSymbolicFeatureType();
			return st.codeOf( d.asSymbolicFeature().value );
		}
	}

}



Copyright © 2008-2011 by TunedIT
Design by luksite