Repository /Rseslib/rseslib-3.0.1.jar:rseslib.structure.data.formats.RseslibDoubleDataInput


Back

No file description

Source code

/*
 * $RCSfile: RseslibDoubleDataInput.java,v $
 * $Revision: 1.12 $
 * $Date: 2007/06/30 17:30:33 $
 * $Author: wojna $
 * 
 * Copyright (C) 2002 - 2007 Logic Group, Institute of Mathematics, Warsaw University
 * 
 *  This file is part of Rseslib.
 *
 *  Rseslib is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  Rseslib is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */


package rseslib.structure.data.formats;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;

import rseslib.structure.attribute.ArrayHeader;
import rseslib.structure.attribute.BadHeaderException;
import rseslib.structure.attribute.Header;
import rseslib.structure.attribute.NominalAttribute;
import rseslib.structure.attribute.formats.HeaderFormatException;
import rseslib.structure.attribute.formats.RseslibHeaderReader;
import rseslib.structure.data.DoubleData;
import rseslib.structure.data.DoubleDataObject;
import rseslib.system.progress.Progress;

/**
 * Input stream for data in the format with a separated header file.
 *
 * @author      Arkadiusz Wojna
 */
public class RseslibDoubleDataInput implements DoubleDataInput
{

    /** Reader from data file. */
    private LineNumberReader m_DataReader;
    /** Header. */
    private ArrayHeader m_Header;
    /** Switch indicating whether a comma is the separator between attribute values. */
    private boolean m_CommaAsValueSeparator = true;
    /** Separator at the end of data line. */
    private String m_LineEnding = null;
    /** Line with the next data object to be read. */
    String m_Line = null;
    /** Progress object for progress reporting. */
    private Progress m_Progress;
    /** Size of data file */
    private long m_FileSize;
    /** Approximate number of bytes read. */
    private int m_BytesRead = 0;
    /** Approximate percentage of data loaded. */
    private int m_Percentage = 0;

    /**
     * Constructs this input stream
     * and reads attribute information
     * from data file.
     *
     * @param dataFile       Data file to be loaded.
     * @param prog           Progress object for progress reporting.
     * @throws IOException   If an I/O error occurs.
     * @throws InterruptedException  If user has interrupted reading data.
     */
    public RseslibDoubleDataInput(File dataFile, Progress prog) throws IOException, HeaderFormatException, DataFormatException, InterruptedException
    {
        if (!containsRseslibHeader(dataFile)) throw new DataFormatException("Unknown data header for file "+dataFile.getPath());
        m_DataReader = new LineNumberReader(new FileReader(dataFile));
        m_Header = new ArrayHeader(m_DataReader);
        detectDataSeparators(dataFile);
        m_Progress = prog;
        m_Progress.set("Loading data from "+dataFile.getPath(), 100);
        m_FileSize = dataFile.length();
        if (m_DataReader.ready()) m_Line = m_DataReader.readLine();
        else close();
    }

    /**
     * Constructs this input stream
     * and verifies information about attributes
     * from data file.
     *
     * @param dataFile       Data file to be loaded.
     * @param hdr            Header to be verified with data specification in a file.
     * @param prog           Progress object for progress reporting.
     * @throws IOException   If an I/O error occurs.
     * @throws InterruptedException  If user has interrupted reading data.
     */
    public RseslibDoubleDataInput(File dataFile, Header hdr, Progress prog) throws IOException, HeaderFormatException, BadHeaderException, InterruptedException
    {
        if (hdr.getClass()!=ArrayHeader.class) throw new BadHeaderException("Only ArrayHeader can be used for loading data from "+dataFile.getPath());
        m_DataReader = new LineNumberReader(new FileReader(dataFile));
        if (containsRseslibHeader(dataFile))
        {
            if (!verifyAttributeTypes(m_DataReader, hdr)) throw new BadHeaderException("Incompatible data header in file "+dataFile.getPath());
        }
        m_Header = (ArrayHeader)hdr;
        detectDataSeparators(dataFile);
        m_Progress = prog;
        m_Progress.set("Loading data from "+dataFile.getPath(), 100);
        m_FileSize = dataFile.length();
        if (m_DataReader.ready()) m_Line = m_DataReader.readLine();
        else close();
    }

    /**
     * Checks whether the file contains rseslib data header at the beginning.
     *
     * @param dataFile Data file to be checked.
     * @return         True if data header is contained, false otherwise.
     * @throws IOException if an I/O error has occured.
     */
    private boolean containsRseslibHeader(File dataFile) throws IOException
    {
        BufferedReader bw = new BufferedReader(new FileReader(dataFile));
        boolean isHeader = false;
        if (bw.ready() && bw.readLine().indexOf("\\"+RseslibHeaderReader.HEADER_START_KEYWORD)==0)
            isHeader = true;
        bw.close();
        return isHeader;
    }

    /**
     * Verifies compatibility of an rses table
     * with a given header.
     *
     * @param input    Input for header information to be verified.
     * @param hdr      Header to be verified.
     * @return         True if headers are compatible, false otherwise.
     * @throws IOException   If an I/O error occurs.
     */
    private static boolean verifyAttributeTypes(LineNumberReader input, Header hdr) throws BadHeaderException, HeaderFormatException, IOException
    {
        if (hdr.getClass()!=ArrayHeader.class) throw new BadHeaderException("Only ArrayHeader can be used for data reading");
        ArrayHeader new_hdr = new ArrayHeader(input);
        if (new_hdr.noOfAttr()!=hdr.noOfAttr()) return false;
        for (String val : new_hdr.missingValues())
            if (!hdr.isMissing(val)) return false;
        for (int attr = 0; attr < new_hdr.noOfAttr(); attr++)
        {
            if (new_hdr.isNominal(attr) && !hdr.isNominal(attr)) return false;
            if (new_hdr.isNumeric(attr) && !hdr.isNumeric(attr)) return false;
            if (new_hdr.isText(attr) && !hdr.isText(attr)) return false;
            if (new_hdr.isConditional(attr) && !hdr.isConditional(attr)) return false;
            if (new_hdr.isDecision(attr) && !hdr.isDecision(attr)) return false;
        }
        return true;
    }

    /**
     * Detects type of attribute value separator and line ending.
     *
     * @param dataFile     Data file.
     * @throws IOException If an I/O error occurs.
     */
    private void detectDataSeparators(File dataFile) throws IOException
    {
        BufferedReader dataReader = new BufferedReader(new FileReader(dataFile));
        String lines[] = new String[2];
        if (dataReader.ready())
        {
            lines[0] = dataReader.readLine();
            if (lines[0].indexOf("\\"+RseslibHeaderReader.HEADER_START_KEYWORD)==0)
            {
                while (dataReader.ready() &&
                       dataReader.readLine().indexOf("\\"+RseslibHeaderReader.HEADER_END_KEYWORD)!=0);
                if (dataReader.ready()) lines[0] = dataReader.readLine();
                else lines[0] = null;
            }
            if (lines[0] != null)
                while ((lines[0].length()==0 || lines[0].charAt(0)=='#') && dataReader.ready())
                    lines[0] = dataReader.readLine();
            if (lines[0].length()==0 || lines[0].charAt(0)=='#') lines[0] = null;
        }
        if (dataReader.ready())
        {
            lines[1] = dataReader.readLine();
            while ((lines[1].length()==0 || lines[1].charAt(0)=='#') && dataReader.ready())
                lines[1] = dataReader.readLine();
            if (lines[1].length()==0 || lines[1].charAt(0)=='#') lines[1] = null;
        }
        boolean dotEnding = true;
        for (int l = 0; m_CommaAsValueSeparator && l < lines.length; l++)
            if (lines[l]!=null)
            {
                int commaIndex = lines[l].indexOf(',');
                int commas = 0;
                while (commaIndex != -1)
                {
                    commas++;
                    commaIndex = lines[l].indexOf(',', commaIndex+1);
                }
                m_CommaAsValueSeparator &= (commas==m_Header.noOfAttrInFile()-1);
                if (lines[l].charAt(lines[l].length()-1)!='.') dotEnding = false;
            }
        if (dotEnding) m_LineEnding = ".";
        dataReader.close();
    }

     /**
      * Returns the array of attributes.
      *
      * @return Array of attributes.
      */
     public Header attributes()
     {
         return m_Header;
     }

     /**
      * Returns true if there is more data to be read, false otherwise.
      *
      * @return True if there is more data to be read, false otherwise.
      * @throws IOException If an I/O error occurs.
      * @throws InterruptedException  If user has interrupted reading data.
      */
    public boolean available() throws IOException, InterruptedException
    {
        if (m_Line!=null)
        {
            while ((m_Line.length()==0 || m_Line.charAt(0)=='#') && m_DataReader.ready())
            {
                m_BytesRead += m_Line.length()+1;
                m_Line = m_DataReader.readLine();
            }
            if (m_Line.length()==0 || m_Line.charAt(0)=='#') close();
        }
        return (m_Line!=null);
    }

    /**
     * Reads a new data from this stream.
     *
     * @return Read data.
     * @throws IOException If an I/O error occurs.
     * @throws InterruptedException  If user has interrupted reading data.
     */
    public DoubleData readDoubleData() throws IOException, DataFormatException, InterruptedException
    {
        if (!available()) throw new IOException("An atempt of reading data from an empty data input");
        int attrIndex = 0;
        int valueIndex = 0;
        DoubleData dObject = new DoubleDataObject(m_Header);
        while (valueIndex < m_Line.length() && Character.isWhitespace(m_Line.charAt(valueIndex))) valueIndex++;
        for (int att = 0; att < m_Header.noOfAttrInFile(); att++)
        {
            if (valueIndex>=m_Line.length()) throw new DataFormatException("Too few attributes in line "+m_DataReader.getLineNumber());
            int sepIndex = valueIndex;
            if (m_CommaAsValueSeparator)
            {
                sepIndex = m_Line.indexOf(',', valueIndex);
                if (att==m_Header.noOfAttrInFile()-1)
                {
                    if (sepIndex!=-1) throw new DataFormatException("Too many attributes in the data line "+m_DataReader.getLineNumber());
                    if (m_LineEnding==null) sepIndex = m_Line.length();
                    else
                    {
                        sepIndex = m_Line.indexOf(m_LineEnding, valueIndex);
                        if (sepIndex==-1) throw new DataFormatException("Line ending not found at the end of the data line "+m_DataReader.getLineNumber());
                    }
                }
                else if (sepIndex==-1) throw new DataFormatException("Too few attributes in the data line "+m_DataReader.getLineNumber());
            }
            else while (sepIndex < m_Line.length() && !Character.isWhitespace(m_Line.charAt(sepIndex))) sepIndex++;
            if (m_Header.attrInFileLoaded(att))
            {
                String value = m_Line.substring(valueIndex, sepIndex);
                if (m_Header.isInterpretable(attrIndex))
                {
                    if (m_Header.isMissing(value))
                        dObject.set(attrIndex, Double.NaN);
                    else if (m_Header.isNominal(attrIndex))
                    {
                        double doubleValue = ((NominalAttribute)m_Header.attribute(attrIndex)).globalValueCode(value);
                        dObject.set(attrIndex, doubleValue);
                    }
                    else if (m_Header.isNumeric(attrIndex))
                        dObject.set(attrIndex, Double.parseDouble(value));
                }
                else if (m_Header.isText(attrIndex))
                    dObject.set(attrIndex, ((NominalAttribute)m_Header.attribute(attrIndex)).globalValueCode(value));
                attrIndex++;
            }
            if (m_CommaAsValueSeparator && att < m_Header.noOfAttrInFile()-1) valueIndex = sepIndex + 1;
            else valueIndex = sepIndex;
            while (valueIndex < m_Line.length() && Character.isWhitespace(m_Line.charAt(valueIndex))) valueIndex++;
            if (att==m_Header.noOfAttrInFile()-1)
                if (m_LineEnding==null)
                {
                    if (valueIndex!=m_Line.length()) throw new DataFormatException("The data line "+m_DataReader.getLineNumber()+" too long");
                }
                else if (valueIndex != m_Line.length()-m_LineEnding.length()) throw new DataFormatException("The data line "+m_DataReader.getLineNumber()+" too long");
        }
        m_BytesRead += m_Line.length()+1;
        while (100*m_BytesRead >= (m_Percentage+1)*m_FileSize)
        {
            m_Percentage++;
            m_Progress.step();
        }
        if (m_DataReader.ready()) m_Line = m_DataReader.readLine();
        else close();
        return dObject;
    }

    /**
     * Closes data input.
     *
     * @throws IOException If an I/O error occurs.
     * @throws InterruptedException  If user has interrupted reading data.
     */
    private void close() throws IOException, InterruptedException
    {
        m_Line = null;
        m_DataReader.close();
        while (m_Percentage < 100)
        {
            m_Percentage++;
            m_Progress.step();
        }
    }
}

Copyright © 2008-2011 by TunedIT
Design by luksite