Repository /Rseslib/rseslib-3.0.1.jar:rseslib.structure.attribute.formats.RseslibHeaderReader


Back

No file description

Source code

/*
 * $RCSfile: RseslibHeaderReader.java,v $
 * $Revision: 1.11 $
 * $Date: 2007/06/30 17:30:33 $
 * $Author: wojna $
 * 
 * Copyright (C) 2002 - 2007 Logic Group, Institute of Mathematics, Warsaw University
 * 
 *  This file is part of Rseslib.
 *
 *  Rseslib is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  Rseslib is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */


package rseslib.structure.attribute.formats;

import java.io.IOException;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Locale;

import rseslib.structure.attribute.Attribute;
import rseslib.structure.attribute.NominalAttribute;
import rseslib.structure.attribute.NumericAttribute;

/**
 * Reader for data header in rseslib format.
 * <p>
 * The header is read line by line; commas are treated as whitespace.
 * A line may be:
 * <ul>
 * <li>empty,</li>
 * <li>a comment starting with <code>#</code>,</li>
 * <li>a command <code>\beginheader</code> (accepted only as the very first
 *     token of the input) or <code>\endheader</code> (accepted only after
 *     <code>\beginheader</code>; it stops the reading); any non-empty prefix
 *     of a command keyword is accepted,</li>
 * <li>a <code>missing_value</code> line enumerating the strings
 *     that denote a missing value,</li>
 * <li>an attribute line: the attribute name followed by feature keywords
 *     (<code>conditional</code>, <code>decision</code>, <code>text</code>,
 *     <code>numeric</code>, <code>nominal</code>, <code>skip</code>;
 *     prefixes are accepted, case is ignored) and optionally
 *     by an enumeration of quoted values in curly braces.</li>
 * </ul>
 *
 * @author      Arkadiusz Wojna
 */
public class RseslibHeaderReader implements HeaderReader
{
    /** The header start keyword. */
    public static final String HEADER_START_KEYWORD = "beginheader";
    /** The keyword in the missing value line in a data header. */
    public static final String MISSING_VALUE_KEYWORD = "missing_value";
    /** The header end keyword. */
    public static final String HEADER_END_KEYWORD = "endheader";

    /** The set of strings that denote missing values (kept in the order of appearance, without duplicates). */
    private final Collection<String> m_MissingValues = new ArrayList<String>();
    /** The first missing value enumerated in header file or null if none was enumerated. */
    private String m_Missing = null;
    /**
     * The table indicating which attributes
     * are to be read in while loading data from file.
     */
    private boolean[] m_AttrLoaded;
    /** Array of attributes (read in only). */
    private Attribute[] m_arrAttributes;

    /**
     * Constructor. Reads the whole header from the given input
     * (up to the header end command or to the end of the input).
     *
     * @param input        Input for reading header information.
     * @throws IOException if an I/O error has occurred.
     * @throws HeaderFormatException if the header is malformed
     *                     or does not define any attribute to be loaded.
     */
    public RseslibHeaderReader(Reader input) throws IOException, HeaderFormatException
    {
        StreamTokenizer st = createTokenizer(input);
        // one entry per attribute line; null stands for a skipped attribute
        ArrayList<Attribute> attrTypes = new ArrayList<Attribute>();
        boolean endExpected = false;
        boolean beginning = true;
        boolean cont = true;
        while (cont && st.nextToken() != StreamTokenizer.TT_EOF)
        {
            if (st.ttype == StreamTokenizer.TT_EOL)
            {
                // empty line: nothing to do
            }
            else if (st.ttype == '#')
            {
                skipRestOfLine(st);
            }
            else if (st.ttype == '\\')
            {
                if (beginning)
                {
                    readHeaderStart(st);
                    endExpected = true;
                }
                else
                {
                    if (!endExpected)
                    {
                        throw new HeaderFormatException("\\ found in wrong place");
                    }
                    readHeaderEnd(st);
                    cont = false;
                }
            }
            else if (st.ttype != StreamTokenizer.TT_WORD)
            {
                throw new HeaderFormatException("The beginning of the line "+st.lineno()+" is neither an attribute name nor the comment character '#' nor the '"+MISSING_VALUE_KEYWORD+"' keyword");
            }
            else if (st.sval.equalsIgnoreCase(MISSING_VALUE_KEYWORD))
            {
                readMissingValues(st);
            }
            else
            {
                attrTypes.add(readAttributeLine(st));
            }
            // the start command is accepted only as the very first token of the input
            beginning = false;
        }

        int noOfAttr = 0;
        for (Attribute attr : attrTypes)
        {
            if (attr != null)
            {
                noOfAttr++;
            }
        }
        if (noOfAttr <= 0)
        {
            throw new HeaderFormatException("No attributes were found in the header file");
        }
        m_AttrLoaded = new boolean[attrTypes.size()];
        m_arrAttributes = new Attribute[noOfAttr];
        int loaded = 0;
        for (int origAttr = 0; origAttr < attrTypes.size(); origAttr++)
        {
            Attribute attr = attrTypes.get(origAttr);
            m_AttrLoaded[origAttr] = (attr != null);
            if (attr != null)
            {
                m_arrAttributes[loaded++] = attr;
            }
        }
    }

    /**
     * Creates the tokenizer configured for the rseslib header syntax:
     * case is preserved, ends of lines are reported, commas are whitespace,
     * the characters _ - ? ' belong to words and " delimits quoted strings.
     *
     * @param input Input for reading header information.
     * @return      Configured tokenizer.
     */
    private static StreamTokenizer createTokenizer(Reader input)
    {
        StreamTokenizer st = new StreamTokenizer(input);
        st.lowerCaseMode(false);
        st.eolIsSignificant(true);
        st.whitespaceChars(',', ',');
        st.wordChars('_', '_');
        st.wordChars('-', '-');
        st.wordChars('?', '?');
        st.wordChars('\'', '\'');
        st.quoteChar('"');
        return st;
    }

    /**
     * Discards all tokens up to and including the nearest end of line
     * (or up to the end of the input).
     *
     * @param st           Tokenizer positioned inside the line to be skipped.
     * @throws IOException if an I/O error has occurred.
     */
    private static void skipRestOfLine(StreamTokenizer st) throws IOException
    {
        while (st.nextToken() != StreamTokenizer.TT_EOF && st.ttype != StreamTokenizer.TT_EOL)
        {
            // the content of a comment is ignored
        }
    }

    /**
     * Reads the header start command following the backslash.
     * The command must be followed by an end of line.
     *
     * @param st           Tokenizer positioned just after the backslash.
     * @throws IOException if an I/O error has occurred.
     * @throws HeaderFormatException if the command is not the start command
     *                     or it is not in a separate line.
     */
    private static void readHeaderStart(StreamTokenizer st) throws IOException, HeaderFormatException
    {
        if (st.nextToken() != StreamTokenizer.TT_WORD || !HEADER_START_KEYWORD.startsWith(st.sval))
        {
            throw new HeaderFormatException("Wrong command found at the beginning of header file");
        }
        if (st.nextToken() != StreamTokenizer.TT_EOL)
        {
            throw new HeaderFormatException("Header start command not in a seperated line");
        }
    }

    /**
     * Reads the header end command following the backslash.
     * The command must be followed by an end of line or by the end of the input.
     *
     * @param st           Tokenizer positioned just after the backslash.
     * @throws IOException if an I/O error has occurred.
     * @throws HeaderFormatException if the command is not the end command
     *                     or it is not in a separate line.
     */
    private static void readHeaderEnd(StreamTokenizer st) throws IOException, HeaderFormatException
    {
        if (st.nextToken() != StreamTokenizer.TT_WORD || !HEADER_END_KEYWORD.startsWith(st.sval))
        {
            throw new HeaderFormatException("Wrong command found in header file");
        }
        if (st.nextToken() != StreamTokenizer.TT_EOL && st.ttype != StreamTokenizer.TT_EOF)
        {
            throw new HeaderFormatException("Header end command not in a seperated line");
        }
    }

    /**
     * Reads the rest of a missing value line and registers every token
     * found in it as a string denoting the missing value.
     *
     * @param st           Tokenizer positioned just after the missing value keyword.
     * @throws IOException if an I/O error has occurred.
     */
    private void readMissingValues(StreamTokenizer st) throws IOException
    {
        while (st.nextToken() != StreamTokenizer.TT_EOF && st.ttype != StreamTokenizer.TT_EOL)
        {
            String missing;
            if (st.ttype == StreamTokenizer.TT_WORD || st.ttype == '"')
            {
                // for a quoted token the tokenizer reports the quote character as the type
                // and the quoted text in sval; the text is the missing value, not the quote
                missing = st.sval;
            }
            else
            {
                // a single ordinary character
                // NOTE(review): a bare numeric token (TT_NUMBER) also gets here and yields
                // a meaningless character; the tokenizer does not keep the original text
                // of a number, so numeric missing values should be written in quotes.
                missing = String.valueOf((char)st.ttype);
            }
            if (m_Missing == null)
            {
                m_Missing = missing;
            }
            if (!m_MissingValues.contains(missing))
            {
                m_MissingValues.add(missing);
            }
        }
    }

    /**
     * Reads the rest of an attribute line: the feature keywords
     * and the optional enumeration of values in curly braces.
     *
     * @param st           Tokenizer positioned at the attribute name (the name is in sval).
     * @return             The attribute defined in the line or null if the attribute is to be skipped.
     * @throws IOException if an I/O error has occurred.
     * @throws HeaderFormatException if the attribute definition is malformed.
     */
    private static Attribute readAttributeLine(StreamTokenizer st) throws IOException, HeaderFormatException
    {
        String name = st.sval;
        // remembered now: once the end of line token is consumed
        // the tokenizer reports the number of the next line
        int line = st.lineno();
        Attribute.Type attrType = null;
        Attribute.ValueSet valueSetType = Attribute.ValueSet.nonapplicable;
        boolean skip = false;
        while (st.nextToken() != StreamTokenizer.TT_EOF && st.ttype != StreamTokenizer.TT_EOL && st.ttype != '{')
        {
            if (st.ttype != StreamTokenizer.TT_WORD)
            {
                throw new HeaderFormatException("Unknown attribute feature in the line "+line);
            }
            // fixed locale: keywords must be recognised regardless of the default locale
            String keyword = st.sval.toLowerCase(Locale.ENGLISH);
            boolean recognised = false;
            // The tests are deliberately not exclusive: a prefix matching
            // two keywords (e.g. "n") is reported as an ambiguous definition.
            if (Attribute.Type.conditional.name().startsWith(keyword))
            {
                attrType = chooseType(attrType, Attribute.Type.conditional, line);
                recognised = true;
            }
            if (Attribute.Type.decision.name().startsWith(keyword))
            {
                attrType = chooseType(attrType, Attribute.Type.decision, line);
                recognised = true;
            }
            if (Attribute.Type.text.name().startsWith(keyword))
            {
                attrType = chooseType(attrType, Attribute.Type.text, line);
                recognised = true;
            }
            if (Attribute.ValueSet.numeric.name().startsWith(keyword))
            {
                valueSetType = chooseValueSet(valueSetType, Attribute.ValueSet.numeric, line);
                recognised = true;
            }
            if (Attribute.ValueSet.nominal.name().startsWith(keyword))
            {
                valueSetType = chooseValueSet(valueSetType, Attribute.ValueSet.nominal, line);
                recognised = true;
            }
            if ("skip".startsWith(keyword))
            {
                skip = true;
                recognised = true;
            }
            if (!recognised)
            {
                throw new HeaderFormatException("Unknown attribute feature "+st.sval+" in the line "+line);
            }
        }

        Attribute attr = null;
        if (!skip)
        {
            if (attrType == null)
            {
                attrType = Attribute.Type.conditional;
            }
            if (attrType == Attribute.Type.text)
            {
                // a value set type given for a text attribute is ignored
                valueSetType = Attribute.ValueSet.nonapplicable;
            }
            else if (valueSetType == Attribute.ValueSet.nonapplicable)
            {
                throw new HeaderFormatException("Value set type required for the attribute in the line "+line);
            }
            switch (valueSetType)
            {
                case nonapplicable:
                case nominal:
                    attr = new NominalAttribute(attrType, name);
                    break;
                case numeric:
                    attr = new NumericAttribute(attrType, name);
                    break;
                default:
                    throw new HeaderFormatException("Unused attribute type in the line "+line);
            }
        }

        if (st.ttype == '{')
        {
            if (valueSetType != Attribute.ValueSet.nominal && valueSetType != Attribute.ValueSet.nonapplicable)
            {
                throw new HeaderFormatException("Value enumaration for non-nominal attibute in the line "+line);
            }
            // the check above guarantees that a non-skipped attribute is nominal here
            NominalAttribute nom = skip ? null : (NominalAttribute)attr;
            while (st.nextToken() != '}')
            {
                if (st.ttype != '"')
                {
                    throw new HeaderFormatException("Unknown attribute feature in the line "+line);
                }
                if (!skip)
                {
                    nom.globalValueCode(st.sval);
                }
            }
            if (st.nextToken() != StreamTokenizer.TT_EOF && st.ttype != StreamTokenizer.TT_EOL)
            {
                throw new HeaderFormatException("Unexpected information after value enumeration in the line "+line);
            }
        }
        return attr;
    }

    /**
     * Accepts an attribute type keyword provided that no type was defined before in the same line.
     *
     * @param current Attribute type defined so far or null.
     * @param chosen  Attribute type denoted by the keyword just read.
     * @param line    Number of the line being read, used in the error message.
     * @return        The chosen attribute type.
     * @throws HeaderFormatException if an attribute type has been already defined.
     */
    private static Attribute.Type chooseType(Attribute.Type current, Attribute.Type chosen, int line) throws HeaderFormatException
    {
        if (current != null)
        {
            throw new HeaderFormatException("Attribute type in the line "+line+" defined ambiguously");
        }
        return chosen;
    }

    /**
     * Accepts a value set type keyword provided that no value set type was defined before in the same line.
     *
     * @param current Value set type defined so far (nonapplicable if none).
     * @param chosen  Value set type denoted by the keyword just read.
     * @param line    Number of the line being read, used in the error message.
     * @return        The chosen value set type.
     * @throws HeaderFormatException if a value set type has been already defined.
     */
    private static Attribute.ValueSet chooseValueSet(Attribute.ValueSet current, Attribute.ValueSet chosen, int line) throws HeaderFormatException
    {
        if (current != Attribute.ValueSet.nonapplicable)
        {
            throw new HeaderFormatException("Value set type in the line "+line+" defined ambiguously");
        }
        return chosen;
    }

    /**
     * Returns the set of all strings denoting missing value.
     * The internal collection is returned, not a copy.
     *
     * @return Set of all strings denoting missing value.
     */
    public Collection<String> allMissing()
    {
        return m_MissingValues;
    }

    /**
     * Returns missing value.
     *
     * @return The first string enumerated in the header as denoting
     *         the missing value or null if the header enumerates none.
     */
    public String singleMissing()
    {
        return m_Missing;
    }

    /**
     * Returns the bit mask indicating
     * which original attributes are to be read in
     * while loading data from file.
     * The internal array is returned, not a copy.
     *
     * @return The bit mask where true at a position i
     *         indicates that attribute i is to be read in
     *         and false indicates that the attribute is to be skipped.
     */
    public boolean[] bitMaskOfLoaded()
    {
        return m_AttrLoaded;
    }

    /**
     * Returns the information about attributes (loaded only),
     * in the order of their definitions in the header.
     * The internal array is returned, not a copy.
     *
     * @return Array of attributes.
     */
    public Attribute[] attributesForLoading()
    {
        return m_arrAttributes;
    }
}

Copyright © 2008-2011 by TunedIT
Design by luksite