Code covered by the BSD License  

Highlights from
Spardat2SSD

from Spardat2SSD by Skynet
Convert a data file from spardat to SSD format.

spardat2ssd(FileIn,FileOut,DataType,DispInterval)
function spardat2ssd(FileIn,FileOut,DataType,DispInterval)

%SPARDAT2SSD Convert a data file from Spardat to SSD format.
%
% SPARDAT2SSD(FILEIN,FILEOUT,DATATYPE,DISPINTERVAL) converts the contents
% of the input file FILEIN to the output file FILEOUT. FILEIN is the name
% of a data file in Spardat format, particularly such as that used by
% SVM-Light. FILEOUT is the name of a data file in Simple Sparse Dataset
% (SSD) format, particularly such as that used by Auton Lab.
%
% The argument DATATYPE is optional. Its value can be either Categorical or
% Real (matched case-insensitively), with the default being Categorical.
% Categorical pertains to a data file which has attribute values of only 1.
% Real pertains to a data file which has real attribute values, e.g. -4, 3,
% 3.14, etc. If set to Categorical (default), the output data file will
% have two columns, with the first column representing the row number
% (starting from 0), and the second column representing the column number
% (with the class being column 1). If set to Real, the output data file
% will also have a third column - this represents the real numbered
% attribute value, written with up to 15 significant digits.
%
% The argument DISPINTERVAL is also optional. This argument controls the
% frequency of display of the conversion status. Its default value is 100,
% which means that the status will display after processing every 100 lines
% from the input file. The status will also display once the input file has
% been fully processed. If set to 0, the status will never display.
%
% EXAMPLES:
%
%   spardat2ssd('spardat_categorical.sample.data','ssd_categorical.sample.csv','Categorical',2)
%   spardat2ssd('spardat_real.sample.data','ssd_real.sample.csv','Real',0)
%   spardat2ssd('spardat.data','ssd.data')
%   spardat2ssd('spardat.data','ssd.data','Real')
%
% REMARKS:
%
%   - The input file must contain cases that are only two-class. The class
%     value must be represented in the first column of the input file.
%     Positive classes must be represented as 1, and negative classes must
%     be represented as either -1 or 0.
%   - Lines beginning with the # character in the input file are ignored as
%     comments. Additionally, anything after the # character in any line of
%     the input file is also ignored as a comment.
%   - Cases that do not have any stated feature values are processed
%     correctly. A negative case without feature values contributes no
%     rows to the output file.
%   - An error is raised if FILEIN cannot be opened for reading, if FILEOUT
%     cannot be opened for writing, or if DATATYPE is not recognised. Only
%     the two files opened by this function are closed on exit, whether the
%     conversion succeeds or fails.
%   - At least at the time of writing this, Auton Lab's software products
%     do not seem to support the SSD format output file containing real
%     numbered attribute values. This output therefore might not have any
%     practical use.
%   - Very limited testing of the source code has been done. Moreover,
%     there is a lot of room to optimize it, especially for conciseness.
%
% VERSION: 20070414
% MATLAB VERSION: 7.4.0.287 (R2007a)

%{
VERSION HISTORY:
20070414: - Added a remark that empty cases are processed correctly.
          - Removed a prohibitive validity check.
20070323: - Set default value of argument DATATYPE to Categorical.
          - Added option to never display status.
          - Replaced : with squeeze function.
20070322: - Added support for real numbered attribute values.
20070321: - Original version.

KEYWORDS:
conversion, sparse, spardat, SSD, SVM-Light, SVMLight, SVM, Auton Lab, 
machine learning
%}

%% Process input

if nargin<2
    help spardat2ssd
    return
end

File.In=FileIn;
File.Out=FileOut;
clear FileIn FileOut

if nargin<3
    DataType='Categorical';
end

%Reject an unknown data type before any file is touched, and canonicalise
%its spelling so that the case-sensitive switch statements below match
ValidTypes={'Categorical','Real'};
TypeIndex=[];
if ~iscell(DataType)
    TypeIndex=find(strcmpi(DataType,ValidTypes),1);
end
if isempty(TypeIndex)
    error('spardat2ssd:invalidDataType',...
          'DATATYPE must be ''Categorical'' or ''Real''.');
end
File.DataType=ValidTypes{TypeIndex};
clear DataType

if nargin<4
    DispInterval=100;
end

%% Open files

%Each file is closed by its own cleanup object when this function exits,
%normally or through an error. fclose('all') is deliberately avoided, as it
%would also close files opened by the caller.
[File.InID,OpenMsg]=fopen(File.In,'r');
if File.InID<0
    error('spardat2ssd:cannotOpenInput',...
          'Cannot open input file ''%s'' for reading: %s',File.In,OpenMsg);
end
CloseIn=onCleanup(@()fclose(File.InID)); %#ok<NASGU>

[File.OutID,OpenMsg]=fopen(File.Out,'w');
if File.OutID<0
    error('spardat2ssd:cannotOpenOutput',...
          'Cannot open output file ''%s'' for writing: %s',File.Out,OpenMsg);
end
CloseOut=onCleanup(@()fclose(File.OutID)); %#ok<NASGU>

%% Main

%Initialize some items
File.InLineCount=0;
Case.Num=0;

%Main loop
while 1
    
    %Read line of input file; fgetl returns the number -1 at end of file
    File.InLine=fgetl(File.InID);
    if ~ischar(File.InLine)
        break
    end
    
    %Count the line only after it is known to exist, so that the final
    %status reports the true number of lines
    File.InLineCount=File.InLineCount+1;
    
    %Process line (if it is not empty and is not a comment)
    File.InLine=strtrim(File.InLine);
    if ~isempty(File.InLine) && File.InLine(1)~='#'
        
        Case.Num=Case.Num+1;

        %Strip away comment from line if it exists
        File.InLineCommentIndices=strfind(File.InLine,'#');
        if ~isempty(File.InLineCommentIndices)
            File.InLine=File.InLine(1:File.InLineCommentIndices(1)-1);
            File.InLine=deblank(File.InLine);
        end
        
        %Parse line using reg exps: the target is the first token, and each
        %feature is an "index:value" pair preceded by a space
        Case.Targ=regexp(File.InLine,'(?<Targ>^[^ ]+)','names');
        Case.Targ=str2double(Case.Targ.Targ);

        Case.Atts=regexp(File.InLine,' (?<Att>\d+):','names');
        Case.Atts=struct2cell(Case.Atts);
        Case.Atts=squeeze(Case.Atts);
        Case.Atts=cellfun(@str2double,Case.Atts);

        Case.Vals=regexp(File.InLine,':(?<Val>[^ ]+)','names');
        Case.Vals=struct2cell(Case.Vals);
        Case.Vals=squeeze(Case.Vals);
        Case.Vals=cellfun(@str2double,Case.Vals);
        
        %Generate output matrix (one column per output row)
        File.OutMatrix=matrix4ssd(Case,File.DataType);

        %Write to file. A negative case without features yields an empty
        %matrix; it is skipped because fprintf called with empty data can
        %still emit the literal separators of the format string.
        if ~isempty(File.OutMatrix)
            switch File.DataType
              case 'Categorical'
                fprintf(File.OutID,'%1.0f,%1.0f\n',File.OutMatrix);
              case 'Real'
                %Use %g for the value: an unsigned integer conversion does
                %not fit real data and falls back to a lossy %e form
                fprintf(File.OutID,'%1.0f,%1.0f,%.15g\n',File.OutMatrix);
            end
        end
        
    end
    
    %Conditionally display status
    if DispInterval && ~rem(File.InLineCount,DispInterval)
        disp(['Processed ',num2str(File.InLineCount),' lines']);
    end
    
end

%Final status once the input file has been fully processed
if DispInterval
    disp(['Processed ',num2str(File.InLineCount),' lines']);
end

%**************************************************************************

%% Subfunction matrix4ssd

function[FileOutMatrix]=matrix4ssd(Case,DataType)
%MATRIX4SSD Build the SSD output matrix for one parsed case.
%
% FILEOUTMATRIX=MATRIX4SSD(CASE,DATATYPE) returns a matrix in which every
% column describes one output row of the SSD file.
%
% CASE is a struct with the fields:
%   Num  - 1-based case number; it is written 0-based.
%   Targ - class of the case: -1 or 0 (negative), or 1 (positive).
%   Atts - attribute indices of the case; they are written shifted up by
%          one, because output column 1 is reserved for the class.
%   Vals - attribute values, one per entry of Atts. Only read when
%          DATATYPE is 'Real'.
%
% DATATYPE is 'Categorical' (result has 2 rows: case number, column
% number) or 'Real' (result has a third row holding the values).
%
% A positive case gets an extra leading column for the class (column
% number 1, value 1). A negative case without attributes gives a 2-by-0 or
% 3-by-0 result.
%
% An error is raised if the target is not -1, 0 or 1, if DATATYPE is not
% recognised, or (for 'Real') if the numbers of attributes and values
% differ.

%% Check validity of target
if Case.Targ~=-1 && Case.Targ~=0 && Case.Targ~=1
    error(['Targets must be -1 or 0, and 1. Case ',num2str(Case.Num),...
           ' has target ',num2str(Case.Targ),'.'])
end

%% Transform input as relevant

%Output rows are numbered from 0
RowNum=Case.Num-1;

%A target of -1 is treated like 0, so only a target of 1 marks the case as
%positive
IsPositive=(Case.Targ==1);

%Shift attributes up by one to make room for the class in column 1. The (:)
%forces a column, so that an empty input becomes 0-by-1 and the transposes
%below always give rows of matching width.
Atts=Case.Atts(:)+1;
if IsPositive
    Atts=[1;Atts];
end

%% Generate output matrix

RowNums=repmat(RowNum,[1 numel(Atts)]);

switch DataType
case 'Categorical'
  FileOutMatrix=[RowNums; Atts'];
case 'Real'
  Vals=Case.Vals(:);
  if IsPositive
      Vals=[1;Vals];
  end
  %A malformed line can give unequal counts; report that explicitly
  %instead of failing inside the concatenation
  if numel(Vals)~=numel(Atts)
      error('matrix4ssd:countMismatch',...
            'Case %d has %d attribute indices but %d attribute values.',...
            Case.Num,numel(Case.Atts),numel(Case.Vals));
  end
  FileOutMatrix=[RowNums; Atts'; Vals'];
otherwise
  %Without this branch the output would be left unassigned
  error('matrix4ssd:invalidDataType',...
        'DataType must be ''Categorical'' or ''Real''.');
end

Contact us at files@mathworks.com