function spardat2ssd(FileIn,FileOut,DataType,DispInterval)
%SPARDAT2SSD Convert a data file from Spardat to SSD format.
%
%   SPARDAT2SSD(FILEIN,FILEOUT,DATATYPE,DISPINTERVAL) converts the contents
%   of the input file FILEIN to the output file FILEOUT. FILEIN is the name
%   of a data file in Spardat format, particularly such as that used by
%   SVM-Light. FILEOUT is the name of a data file in Simple Sparse Dataset
%   (SSD) format, particularly such as that used by Auton Lab.
%
%   The argument DATATYPE is optional. Its value can be either Categorical
%   or Real (matched case-insensitively), with the default being
%   Categorical. An empty value also selects the default. Categorical
%   pertains to a data file which has attribute values of only 1. Real
%   pertains to a data file which has real attribute values, e.g. -4, 3,
%   3.14, etc. If set to Categorical (default), the output data file will
%   have two columns, with the first column representing the row number
%   (starting from 0), and the second column representing the column number
%   (with the class being column 1). If set to Real, the output data file
%   will also have a third column - this represents the real numbered
%   attribute value, written with up to 15 significant digits.
%
%   The argument DISPINTERVAL is also optional. This argument controls the
%   frequency of display of the conversion status. Its default value is
%   100, which means that the status will display after processing every
%   100 lines from the input file. An empty value also selects the default.
%   The status will also display once the input file has been fully
%   processed. If set to 0, the status will never display.
%
%   An error is raised if FILEIN or FILEOUT cannot be opened, or if
%   DATATYPE is not one of the two supported values. Both files are closed
%   before returning, including when an error occurs during conversion.
%
%   EXAMPLES:
%
%   spardat2ssd('spardat_categorical.sample.data','ssd_categorical.sample.csv','Categorical',2)
%   spardat2ssd('spardat_real.sample.data','ssd_real.sample.csv','Real',0)
%   spardat2ssd('spardat.data','ssd.data')
%   spardat2ssd('spardat.data','ssd.data','Real')
%
%   REMARKS:
%
%   - The input file must contain cases that are only two-class. The class
%     value must be represented in the first column of the input file.
%     Positive classes must be represented as 1, and negative classes must
%     be represented as either -1 or 0.
%   - Lines beginning with the # character in the input file are ignored as
%     comments. Additionally, anything after the # character in any line of
%     the input file is also ignored as a comment.
%   - Cases that do not have any stated feature values are processed
%     correctly. A negative case without features contributes no rows to
%     the output file.
%   - At least at the time of writing this, Auton Lab's software products
%     do not seem to support the SSD format output file containing real
%     numbered attribute values. This output therefore might not have any
%     practical use.
%   - Very limited testing of the source code has been done. Moreover,
%     there is a lot of room to optimize it, especially for conciseness.
%
%   VERSION: 20070414
%   MATLAB VERSION: 7.4.0.287 (R2007a)
%{
VERSION HISTORY:
20070414: - Added a remark that empty cases are processed correctly.
          - Removed a prohibitive validity check.
20070323: - Set default value of argument DATATYPE to Categorical.
          - Added option to never display status.
          - Replaced : with squeeze function.
20070322: - Added support for real numbered attribute values.
20070321: - Original version.
KEYWORDS:
conversion, sparse, spardat, SSD, SVM-Light, SVMLight, SVM, Auton Lab,
machine learning
%}
%% Process input
if nargin<2
    help spardat2ssd
    return
end
if nargin<3 || isempty(DataType)
    DataType='Categorical';
end
if nargin<4 || isempty(DispInterval)
    DispInterval=100;
end
File.In=FileIn;
File.Out=FileOut;
%Normalize DATATYPE to its canonical spelling so that the (case-sensitive)
%subfunction always receives a value it recognizes, and pick the matching
%output format once instead of on every line.
if strcmpi(DataType,'Categorical')
    File.DataType='Categorical';
    File.OutFormat='%1.0f,%1.0f\n';
elseif strcmpi(DataType,'Real')
    File.DataType='Real';
    %Use %.15g for the value: an integer conversion does not fit real
    %values such as 3.14 and would lose precision.
    File.OutFormat='%1.0f,%1.0f,%.15g\n';
else
    error('spardat2ssd:InvalidDataType',...
        'DATATYPE must be ''Categorical'' or ''Real''.');
end
%% Open files
[File.InID,File.OpenMsg]=fopen(File.In,'r');
if File.InID==-1
    error('spardat2ssd:FileOpenFailed',...
        'Cannot open input file ''%s'': %s',File.In,File.OpenMsg);
end
[File.OutID,File.OpenMsg]=fopen(File.Out,'w');
if File.OutID==-1
    fclose(File.InID);
    error('spardat2ssd:FileOpenFailed',...
        'Cannot open output file ''%s'': %s',File.Out,File.OpenMsg);
end
%% Main
%Initialize some items
File.InLineCount=0;
Case.Num=0;
%Main loop. The try/catch guarantees that both files are closed even if a
%line cannot be converted (e.g. an invalid target value).
try
    while true
        %Read line of input file
        File.InLine=fgetl(File.InID);
        %Exit if reached end of file (fgetl returns -1 instead of a string)
        if ~ischar(File.InLine)
            break
        end
        %Count the line only after it is known to be a real line, so that
        %the end-of-file read is not reported as a processed line.
        File.InLineCount=File.InLineCount+1;
        %Process line (if it is not empty and is not a comment)
        File.InLine=strtrim(File.InLine);
        if ~isempty(File.InLine) && File.InLine(1)~='#'
            Case.Num=Case.Num+1;
            %Strip away comment from line if it exists
            CommentIndices=strfind(File.InLine,'#');
            if ~isempty(CommentIndices)
                File.InLine=deblank(File.InLine(1:CommentIndices(1)-1));
            end
            %The target is the first space-delimited token of the line
            Case.Targ=str2double(strtok(File.InLine,' '));
            %Parse index:value pairs with a single expression so that every
            %attribute index stays aligned with its own value.
            Pairs=regexp(File.InLine,' (?<Att>\d+):(?<Val>[^ ]+)','names');
            if isempty(Pairs) || ~isfield(Pairs,'Att')
                %Case without any stated feature values
                Case.Atts=zeros(0,1);
                Case.Vals=zeros(0,1);
            else
                %The subfunction expects column vectors
                Case.Atts=str2double({Pairs.Att});
                Case.Atts=Case.Atts(:);
                Case.Vals=str2double({Pairs.Val});
                Case.Vals=Case.Vals(:);
            end
            %Generate output matrix (one column per output row)
            File.OutMatrix=matrix4ssd(Case,File.DataType);
            %Write to file. Skip empty matrices: calling fprintf with empty
            %data would still emit the literal text of the format.
            if ~isempty(File.OutMatrix)
                fprintf(File.OutID,File.OutFormat,File.OutMatrix);
            end
        end
        %Conditionally display status
        if DispInterval && ~rem(File.InLineCount,DispInterval)
            disp(['Processed ',num2str(File.InLineCount),' lines']);
        end
    end
catch
    %Close only the files opened here, then propagate the original error.
    %rethrow(lasterror) is used for compatibility with MATLAB R2007a.
    fclose(File.InID);
    fclose(File.OutID);
    rethrow(lasterror);
end
fclose(File.InID);
fclose(File.OutID);
%Display the final status, unless the periodic status has just shown the
%same count.
if DispInterval && (File.InLineCount==0 || rem(File.InLineCount,DispInterval))
    disp(['Processed ',num2str(File.InLineCount),' lines']);
end
%**************************************************************************
%% Subfunction matrix4ssd
function[FileOutMatrix]=matrix4ssd(Case,DataType)
%MATRIX4SSD Build the SSD output matrix for a single case.
%
%   FILEOUTMATRIX=MATRIX4SSD(CASE,DATATYPE) returns a matrix with one
%   column per output row of the SSD file.
%
%   CASE is a structure with the fields:
%     Num  - case number, counted from 1.
%     Targ - target value; must be -1, 0 or 1.
%     Atts - vector (row or column) of attribute numbers.
%     Vals - vector (row or column) of attribute values.
%
%   DATATYPE is 'Categorical' or 'Real' (matched case-insensitively).
%
%   For 'Categorical' the result has two rows: the zero-based case number
%   and the attribute number incremented by 1. For 'Real' a third row holds
%   the attribute values. A target of 1 is stored as attribute number 1
%   with value 1; a target of -1 or 0 produces no entry. The result is
%   empty when there is nothing to output for the case.
%
%   An error is raised for an invalid target, for an unknown DATATYPE, and
%   (for 'Real') when Atts and Vals differ in length.
%% Check validity of target
if Case.Targ~=-1 && Case.Targ~=0 && Case.Targ~=1
    error(['Targets must be -1 or 0, and 1. Case ',num2str(Case.Num),...
        ' has target ',num2str(Case.Targ),'.'])
end
%% Transform input as relevant
%Row numbers in the output are zero-based
RowNum=Case.Num-1;
%Force column vectors so that row-vector input is accepted too, and shift
%the attribute numbers by one because column 1 is reserved for the class
Atts=Case.Atts(:)+1;
Vals=Case.Vals(:);
%A positive target is represented as an entry in column 1 with value 1.
%Negative targets (-1 or 0) are represented by the absence of that entry.
if Case.Targ==1
    Atts=[1;Atts];
    Vals=[1;Vals];
end
%% Generate output matrix
RowNums=repmat(RowNum,[1 numel(Atts)]);
if ischar(DataType)
    Kind=lower(DataType);
else
    %Non-text input falls through to the 'otherwise' branch below
    Kind='';
end
switch Kind
    case 'categorical'
        FileOutMatrix=[RowNums; Atts.'];
    case 'real'
        if numel(Vals)~=numel(Atts)
            error('matrix4ssd:SizeMismatch',...
                'Case %d has %d attributes but %d values.',...
                Case.Num,numel(Case.Atts),numel(Case.Vals));
        end
        FileOutMatrix=[RowNums; Atts.'; Vals.'];
    otherwise
        %Without this branch the output would be left unassigned
        error('matrix4ssd:InvalidDataType',...
            'DataType must be ''Categorical'' or ''Real''.');
end