Code covered by the BSD License  

Highlights from
arffparser

from arffparser by Konstantinos Drossos
M-File for reading and writing / creating arff files to and from MATLAB.

arffparser(mode, fileName, inputStruct, relationName, ...
function out = arffparser(mode, fileName, inputStruct, relationName, ...
    cmmnts)
%ARFFPARSER Read or write a WEKA .arff file.
%   OUT = ARFFPARSER(MODE, FILENAME, INPUTSTRUCT, RELATIONNAME, CMMNTS)
%   reads (MODE 'read') the .arff file FILENAME into the struct OUT, or
%   writes (MODE 'write') the struct INPUTSTRUCT to the .arff file
%   FILENAME. MODE is not case sensitive. FILENAME can be a relative or an
%   absolute path; the '.arff' extension is appended when it is missing.
%
%   READ MODE
%     OUT = ARFFPARSER('read', FILENAME)
%
%     OUT has one field per attribute. Each field is a struct with:
%       kind   - 'numeric' (numeric, real and integer attributes),
%                'string', 'date', or, for nominal attributes, the labels
%                as a numeric row vector (when the first label starts with
%                a digit and every label is a number) or as a cell array
%                of strings.
%       values - 1-by-N numeric vector (numeric attributes and nominal
%                attributes with numeric labels) or 1-by-N cell array of
%                strings (everything else), N being the number of
%                instances.
%
%     Notes on read mode:
%       * The whole file is converted to lower case before parsing, so
%         field names, labels and string values are returned lower case.
%       * Field names are the attribute names with white space and
%         punctuation (including '_') removed.
%       * All white space inside data rows and nominal labels is removed.
%       * A comma inside a quoted value is replaced by '_'.
%       * A missing value ('?') is returned as NaN for numeric values and
%         as the string 'NaN' for string values.
%       * Extra input arguments are ignored (with a warning).
%       * The sparse ARFF data format is not supported.
%
%   WRITE MODE
%     OUT = ARFFPARSER('write', FILENAME, INPUTSTRUCT, RELATIONNAME)
%     OUT = ARFFPARSER('write', FILENAME, INPUTSTRUCT, RELATIONNAME, CMMNTS)
%
%     INPUTSTRUCT must have the following form:
%
%                                  --> KIND   (INPUTSTRUCT.ATTRIBUTE.KIND)
%   INPUTSTRUCT ---> ATTRIBUTE --->|
%                                  --> VALUES (INPUTSTRUCT.ATTRIBUTE.VALUES)
%
%     Every field of INPUTSTRUCT is one attribute and must itself be a
%     struct with the fields KIND and VALUES (field names are not case
%     sensitive). KIND can be:
%
%       - 'Numeric' ('Real' and 'Integer' are also accepted)
%       - 'String'
%       - 'Date', written with the format "yyyy-MM-dd HH:mm:ss"
%       - a numeric vector or a cell array of strings holding the labels
%         of a nominal attribute
%
%     VALUES must be a numeric vector or a cell array of strings. NaN and
%     the string 'NaN' are written as missing values ('?'). When the
%     attributes do not all have the same number of values, the shorter
%     ones are padded with missing values and a warning is issued.
%
%     RELATIONNAME is the name of the relation. When it does not start
%     with '@', '@RELATION ' is put in front of it (and it is quoted when
%     it contains white space).
%
%     CMMNTS is optional and must be a vector cell array of strings. Each
%     element is written as one comment line at the very start of the
%     file. The '%' character does not need to be included.
%
%     OUT is 1 when the file has been written.
%
%   All errors are raised with the identifier 'ERROR:ARFFPARSER' and all
%   warnings with the identifier 'WARNING:ARFFPARSER'.
%
%   -----------------------------------------------------------------------
%
%   EXAMPLES:
%
%   **************************
%   ******* Write mode *******
%
%     >> a = struct();
%
%     >> a.attribute1.kind = 'Numeric';
%     >> a.attribute1.values = [1.234e5 5.678e9];
%
%     >> a.attribute2.kind = [1 2 3 4];
%     >> a.attribute2.values = [3 4];
%
%     >> a.attribute3.kind = 'string';
%     >> a.attribute3.values = {'test1' 'test2'};
%
%     >> arffparser('write', 'testFile', a, 'test relation');
%
%   NOTE the explicit use of a cell array for string values!
%
%   The above creates the file testFile.arff with the relation
%   "test relation" and the attributes (in order of appearance):
%
%   @ATTRIBUTE "attribute1" NUMERIC
%   @ATTRIBUTE "attribute2" {1,2,3,4}
%   @ATTRIBUTE "attribute3" STRING
%
%
%   *************************
%   ******* Read mode *******
%
%     >> a = arffparser('read', 'testFile');
%
%   If 'testFile' is the file created in the previous example, then struct
%   a will have the form:
%
%     attribute1: [1x1 struct]
%     attribute2: [1x1 struct]
%     attribute3: [1x1 struct]
%
%   -----------------------------------------------------------------------
%
%   Authors: Konstantinos Drossos, Dr. Andreas Floros
%
%   Affiliation: Ionian University, Dept of Audiovisual Arts, Corfu, Greece
%
%   Version: 2.0
%
%   For bugs and needed features, please email to:
%
%                  <a href="mailto:kdrosos@ionio.gr">kdrosos@ionio.gr</a>
%
%   Date: 22/11/2012 (dd-mm-yyyy)
%
%   -----------------------------------------------------------------------

    % Validate the mode argument
    if ~ischar(mode)
        error('ERROR:ARFFPARSER','Mode argument must be a string');
    end

    mode = lower(mode);

    if ~strcmp(mode, 'read') && ~strcmp(mode, 'write')
        error('ERROR:ARFFPARSER',...
            'Mode argument must be either ''read'' or ''write''');
    end

    % Both modes need a file name
    if nargin < 2
        error('ERROR:ARFFPARSER', 'Too few input arguments');
    end

    if ~ischar(fileName) || isempty(fileName)
        error('ERROR:ARFFPARSER', 'File name must be a non-empty string');
    end

    if strcmp(mode, 'read')

        if nargin > 2
            warning('WARNING:ARFFPARSER',...
                'Extra arguments will be ommited');
        end

        out = readArffFile(fileName);

    else

        % The struct and the relation name are mandatory in write mode
        if nargin < 4
            error('ERROR:ARFFPARSER', 'Too few input arguments');
        end

        commentLines = {};
        writeComments = false;

        if nargin == 5
            [commentLines, writeComments] = checkComments(cmmnts);
        end

        out = writeArffFile(fileName, inputStruct, relationName, ...
            commentLines, writeComments);
    end
end


function out = readArffFile(fileName)
% Parse the .arff file FILENAME into a struct with one field per attribute.

    % Append the extension when the name does not contain it
    if isempty(strfind(fileName, '.arff'))
        fileName = [fileName '.arff'];
    end

    fid = fopen(fileName, 'r');

    if fid == -1
        error('ERROR:ARFFPARSER', 'File could not be opened: %s', ...
            fileName);
    end

    % Make sure the file is closed even when textscan fails
    fileCloser = onCleanup(@() fclose(fid));
    scanned = textscan(fid, '%s', 'delimiter', '\n');
    clear fileCloser;

    % One cell per line: trimmed, lower case, without empty lines
    theText = lower(strtrim(scanned{1}));
    theText = theText(~cellfun(@isempty, theText));

    % Everything before the relation declaration is ignored
    relationRow = find(strncmp(theText, '@relation', 9), 1);

    if isempty(relationRow)
        error('ERROR:ARFFPARSER', ...
            'Not valid relation naming in .arff file');
    end

    theText = theText(relationRow:end);

    % Strip off comment lines
    theText = theText(~strncmp(theText, '%', 1));

    % Break up the text in the attribute region and the data region
    dataRow = find(strncmp(theText, '@data', 5), 1);

    if isempty(dataRow)
        error('ERROR:ARFFPARSER', 'No @data section in .arff file');
    end

    textAttr = theText(2:dataRow - 1);
    textVals = theText(dataRow + 1:end);

    numAttrs = numel(textAttr);
    numRows = numel(textVals);

    if numAttrs == 0
        error('ERROR:ARFFPARSER', ...
            'No attribute declarations found in .arff file');
    end

    % Split every data row once into its comma separated fields
    rawFields = cell(numRows, numAttrs);

    for rowIndx = 1:numRows

        rowText = textVals{rowIndx};
        rowText = maskQuotedCommas(rowText(~isspace(rowText)));
        rowParts = regexp(rowText, ',', 'split');

        if numel(rowParts) ~= numAttrs
            error('ERROR:ARFFPARSER', ...
                'Data row %d has %d values but %d attributes are declared', ...
                rowIndx, numel(rowParts), numAttrs);
        end

        rawFields(rowIndx, :) = rowParts;
    end

    % Build the output struct, one attribute at a time
    out = struct();

    for attrIndx = 1:numAttrs

        [fieldName, kindValue, asNumeric] = ...
            parseAttributeLine(textAttr{attrIndx});

        out.(fieldName).kind = kindValue;
        out.(fieldName).values = ...
            convertColumn(rawFields(:, attrIndx), asNumeric);
    end
end


function [fieldName, kindValue, asNumeric] = parseAttributeLine(declLine)
% Split one (lower case) @attribute line into the struct field name, the
% value for the KIND field and a flag telling whether the values of the
% attribute are numbers.

    noKindMsg = 'No appropriate kind of attribute';

    % Token 1 is the (possibly quoted) name, token 2 the type declaration
    parts = regexp(declLine, ...
        '^@attribute\s+(".*?"|''.*?''|[^\s{]+)\s*(.*)$', ...
        'tokens', 'once');

    if isempty(parts)
        error('ERROR:ARFFPARSER', noKindMsg);
    end

    fieldName = makeFieldName(parts{1});
    typeSpec = strtrim(parts{2});

    if ~isempty(typeSpec) && typeSpec(1) == '{'

        % Nominal attribute: the labels are listed between the braces
        closingBrace = find(typeSpec == '}', 1, 'last');

        if isempty(closingBrace)
            error('ERROR:ARFFPARSER', noKindMsg);
        end

        kindValue = parseNominalLabels(typeSpec(2:closingBrace - 1));
        asNumeric = isnumeric(kindValue) && ~isempty(kindValue);

    elseif any(strcmp(typeSpec, {'numeric', 'real', 'integer'}))

        % Real and integer attributes are treated as numeric
        kindValue = 'numeric';
        asNumeric = true;

    elseif strcmp(typeSpec, 'string')

        kindValue = 'string';
        asNumeric = false;

    elseif ~isempty(regexp(typeSpec, '^date(\s|$)', 'once'))

        % The optional date format is ignored, dates are kept as strings
        kindValue = 'date';
        asNumeric = false;

    else
        error('ERROR:ARFFPARSER', noKindMsg);
    end
end


function fieldName = makeFieldName(rawName)
% Turn an attribute name into a struct field name: white space and
% punctuation (quotes and '_' included) are removed and any remaining
% non alphanumeric character is replaced by '_'.

    fieldName = rawName(~isspace(rawName));
    fieldName(isstrprop(fieldName, 'punct')) = [];
    fieldName(~isstrprop(fieldName, 'alphanum')) = '_';

    if isempty(regexp(fieldName, '^[a-zA-Z]\w*$', 'once'))
        error('ERROR:ARFFPARSER', ...
            'Attribute name %s cannot be used as a struct field name', ...
            rawName);
    end
end


function labels = parseNominalLabels(labelText)
% Convert the text between the braces of a nominal declaration into a
% numeric row vector or a row cell array of strings.

    labelText = maskQuotedCommas(labelText(~isspace(labelText)));

    if isempty(labelText)
        labels = {};
        return
    end

    labels = regexp(labelText, ',', 'split');

    for indx = 1:numel(labels)
        labels{indx} = stripOuterQuotes(labels{indx});
    end

    % Labels are numbers only when the list starts with a digit and every
    % single label can be converted
    if isstrprop(labelText(1), 'digit')

        numericLabels = str2double(labels);

        if ~any(isnan(numericLabels))
            labels = numericLabels;
        end
    end
end


function values = convertColumn(rawColumn, asNumeric)
% Convert the raw text fields of one attribute into a 1-by-N numeric
% vector (ASNUMERIC true) or a 1-by-N cell array of strings.

    if isempty(rawColumn)
        if asNumeric
            values = [];
        else
            values = {};
        end
        return
    end

    rawColumn = reshape(rawColumn, 1, []);

    for indx = 1:numel(rawColumn)
        rawColumn{indx} = stripOuterQuotes(rawColumn{indx});
    end

    if asNumeric
        % Missing ('?') and unparsable entries become NaN, so the vector
        % always has one element per instance
        values = str2double(rawColumn);
    else
        rawColumn(strcmp(rawColumn, '?')) = {'NaN'};
        values = rawColumn;
    end
end


function rowText = maskQuotedCommas(rowText)
% Replace every comma that lies inside single or double quotes by '_', so
% that the remaining commas are exactly the field separators.

    activeQuote = '';

    for indx = 1:length(rowText)

        currentChar = rowText(indx);

        if isempty(activeQuote)
            if currentChar == '''' || currentChar == '"'
                activeQuote = currentChar;
            end
        elseif currentChar == activeQuote
            activeQuote = '';
        elseif currentChar == ','
            rowText(indx) = '_';
        end
    end
end


function txt = stripOuterQuotes(txt)
% Remove one leading and one trailing quote character (single or double),
% when present.

    if ~isempty(txt) && (txt(1) == '''' || txt(1) == '"')
        txt(1) = [];
    end

    if ~isempty(txt) && (txt(end) == '''' || txt(end) == '"')
        txt(end) = [];
    end
end


function [commentLines, isValid] = checkComments(cmmnts)
% Return the comment lines when CMMNTS is a vector cell array of strings;
% otherwise warn and return no comments.

    isValid = isvector(cmmnts) && iscellstr(cmmnts);

    if isValid
        commentLines = cmmnts;
    else
        commentLines = {};
        warning('WARNING:ARFFPARSER', ...
            'Comments must be in a vector cell array of strings');
    end
end


function out = writeArffFile(fileName, inputStruct, relationName, ...
    commentLines, writeComments)
% Write INPUTSTRUCT to the .arff file FILENAME. All the text is built and
% validated before the file is opened, so invalid input never leaves a
% half written file behind.

    % Append the extension unless the name already ends with it
    if isempty(regexpi(fileName, '\.arff$', 'once'))
        fileName = [fileName '.arff'];
    end

    % Build the relation declaration
    if ~ischar(relationName) || isempty(relationName)
        error('ERROR:ARFFPARSER', ...
            'Relation name must be a non-empty string');
    end

    if relationName(1) ~= '@'

        % The ARFF format requires quotes around names with spaces
        if any(isspace(relationName)) && relationName(1) ~= '"' && ...
                relationName(1) ~= ''''
            relationName = ['"' relationName '"'];
        end

        relationName = ['@RELATION ', relationName];
    end

    % Get the attributes
    if ~isstruct(inputStruct) || numel(inputStruct) ~= 1
        error('ERROR:ARFFPARSER', 'Input must be a scalar struct');
    end

    attrs = fieldnames(inputStruct);
    numAttrs = numel(attrs);

    if numAttrs == 0
        error('ERROR:ARFFPARSER', 'Input struct has no attributes');
    end

    attrLines = cell(numAttrs, 1);
    valueColumns = cell(1, numAttrs);

    for indx = 1:numAttrs

        attrName = attrs{indx};
        attrSpec = inputStruct.(attrName);

        if ~isstruct(attrSpec) || numel(attrSpec) ~= 1
            error('ERROR:ARFFPARSER', ...
                'Attribute %s must be a struct with kind and values fields', ...
                attrName);
        end

        specFields = fieldnames(attrSpec);

        % Get the kind of the attribute (field name is case insensitive)
        kindField = find(strcmpi(specFields, 'kind'), 1);

        if isempty(kindField)
            error('ERROR:ARFFPARSER', 'No kind field in %s', attrName);
        end

        theKind = attrSpec.(specFields{kindField});

        attrLines{indx} = ['@ATTRIBUTE "', attrName, '" ', ...
            kindToText(theKind, attrName)];

        % Get the values (field name is case insensitive)
        valuesField = find(strcmpi(specFields, 'values'), 1);

        if isempty(valuesField)
            error('ERROR:ARFFPARSER', 'No values field in %s', attrName);
        end

        theValues = attrSpec.(specFields{valuesField});

        if ~isvector(theValues) || ...
                ~(isnumeric(theValues) || iscellstr(theValues))
            error('ERROR:ARFFPARSER', ...
                'Values must be in a numeric or cell string vector');
        end

        valueColumns{indx} = valuesToText(theValues);
    end

    % Arrange the values in one row per instance; attributes with fewer
    % values are padded with the missing value marker
    valueCounts = cellfun(@numel, valueColumns);
    numRows = max(valueCounts);

    if any(valueCounts ~= numRows)
        warning('WARNING:ARFFPARSER', ...
            'Attributes differ in number of values, padding with ''?''');
    end

    cellValues = repmat({'?'}, numRows, numAttrs);

    for indx = 1:numAttrs
        cellValues(1:valueCounts(indx), indx) = valueColumns{indx}(:);
    end

    % Create the file
    fid = fopen(fileName, 'w');

    if fid == -1
        error('ERROR:ARFFPARSER', 'File could not be opened: %s', ...
            fileName);
    end

    % Closes the file when this function returns or fails
    fileCloser = onCleanup(@() fclose(fid));

    % Comments go to the very start of the file
    if writeComments

        for indx = 1:numel(commentLines)
            fprintf(fid, '%% %s\n', commentLines{indx});
        end

        fprintf(fid, '\n');
    end

    % Print the relation's name followed by an empty line
    fprintf(fid, '%s', relationName);
    fprintf(fid, '\n\n');

    % Print the attributes part
    for indx = 1:numAttrs
        fprintf(fid, '%s\n', attrLines{indx});
    end

    fprintf(fid, '\n');

    % Print the data part
    fprintf(fid, '@DATA\n');

    for indx = 1:numRows
        fprintf(fid, '%s\n', joinWithCommas(cellValues(indx, :)));
    end

    fprintf(fid, '\n');
    fprintf(fid, '%% .arff file created with arffparser.m');

    out = 1;
end


function typeText = kindToText(theKind, attrName)
% Build the type part of an @ATTRIBUTE line from the KIND field.

    if ischar(theKind)

        switch lower(theKind)
            case {'numeric', 'real', 'integer'}
                typeText = upper(theKind);
            case 'string'
                typeText = 'STRING';
            case 'date'
                typeText = 'DATE "yyyy-MM-dd HH:mm:ss"';
            otherwise
                error('ERROR:ARFFPARSER', ...
                    'Unsupported kind %s in %s', theKind, attrName);
        end

    elseif ~isvector(theKind)

        error('ERROR:ARFFPARSER', ...
            'Nominal input should be a vector');

    elseif isnumeric(theKind)

        % One label per element, formatted exactly like numeric values
        labels = cell(1, numel(theKind));

        for indx = 1:numel(theKind)
            labels{indx} = sprintf('%.15G', theKind(indx));
        end

        typeText = ['{' joinWithCommas(labels) '}'];

    elseif iscellstr(theKind)

        % NOTE: white space is removed from the labels but not from the
        % values of the attribute, as in previous versions
        labels = cell(1, numel(theKind));

        for indx = 1:numel(theKind)
            labels{indx} = ['''' ...
                theKind{indx}(~isspace(theKind{indx})) ''''];
        end

        typeText = ['{' joinWithCommas(labels) '}'];

    else
        q = ['Nominal number input must be ' ...
            'either numberic vector ' ...
            'or cell string vector'];
        error('ERROR:ARFFPARSER', q)
    end
end


function textCells = valuesToText(theValues)
% Convert a numeric vector or a cell array of strings into a 1-by-N cell
% array holding the text to be written for each instance.

    numValues = numel(theValues);
    textCells = cell(1, numValues);

    if isnumeric(theValues)

        for indx = 1:numValues
            if isnan(theValues(indx))
                textCells{indx} = '?';
            else
                % 15 significant digits, so that values are not truncated
                textCells{indx} = sprintf('%.15G', theValues(indx));
            end
        end

    else

        for indx = 1:numValues
            if strcmp(theValues{indx}, 'NaN')
                textCells{indx} = '?';
            else
                textCells{indx} = ['''' theValues{indx} ''''];
            end
        end
    end
end


function joined = joinWithCommas(parts)
% Concatenate the strings of the cell array PARTS, separated by commas.

    if isempty(parts)
        joined = '';
        return
    end

    parts = reshape(parts, 1, []);

    % Interleave the parts with commas and drop the trailing comma
    interleaved = [parts; repmat({','}, 1, numel(parts))];
    joined = [interleaved{1:end - 1}];
end

Contact us