No BSD License  

Highlights from
Robust Data File Reading utility (RDFREAD)

from Robust Data File Reading utility (RDFREAD) by Michael Boldin
Imports data from a comma- or tab-delimited file.

rdfread(varargin)
function varargout= rdfread(varargin)
%RDFREAD Robust Data File Reading (import) utility  M Boldin  October 2004 (v1.1)
%
%   DATA = RDFREAD(FILENAME) Built for comma-delimited (csv) datafile importing.
%       Will also read tab-delimited files. Not recommended for space-separated cases.
%
%       Data may have missing values and character values mixed in the numeric
%          data columns.  All non-numerics in data lines are changed to NaN.
%          This function will not properly read columns that are entirely characters.
%
%       If the first line has no numerics, it is assumed to be a header line
%          with variable names.  In this case a structure is created
%          with the fields name.varnames & name.data. If there is no
%          header line, a simple data matrix is created.
%
%       If there is no output argument (nargout=0), the results are put into
%         the base workspace using the file name (without folder and
%         extension) as the variable name.  Characters that are not legal in
%         a MATLAB variable name are replaced by '_' and an 'x' is prefixed
%         when the name would otherwise be invalid.
%
%   Inputs (all optional)
%       varargin{1}  file name (char), or the number 1 to be prompted for the
%                    file name on the command line.  With no inputs a file
%                    selection dialog (uigetfile) is shown.
%       varargin{2}  folder that contains the file (trailing separator optional).
%
%   Output
%       DATA         numeric matrix, or a struct with fields 'data' and
%                    'varnames' when a header line is detected.
%
%   Errors are raised when more than one output is requested, when the first
%   input is neither a file name nor 1, when no file is selected, when the
%   file cannot be opened, or when it contains no lines.
%
%       examples
%         RDFREAD -- no input args, uses GUI file selection (uigetfile)
%         RDFREAD(1) -- command line version
%         RDFREAD('file_name','C:\data_path\') -- works (path is optional)
%         xdat= RDFREAD('file_name')  -- results placed in xdat
%
%        Below is an example of a messy data file that this utility function can read.
%
%          SECID,DATE,PRC,X1
%          10001,19970131,8.625,1
%          10002,1997215,13,
%          10003,,-15,4
%          1099a,o.0,.T,NaN
%          xd,,.,1.111
%          999,2,3,6
%
%       In this case, the RDFREAD 'data' results are
%
%        10001     19970131        8.625            1
%        10002      1997215           13          NaN
%        10003          NaN          -15            4
%          NaN          NaN          NaN          NaN
%          NaN          NaN          NaN        1.111
%          999            2            3            6
%
%       and filename.varnames=
%
%       'SECID'    'DATE'    'PRC'    'X1'
%
%       Note that this function is considerably slower than i/o routines
%       such as LOAD and DLMREAD because it reads and parses one line
%       at a time, but it works in a much more robust manner as long as you
%       do not need to read in 'character' columns.
%
%       Field text is converted with STR2DOUBLE, so nothing read from the
%       file is ever evaluated as MATLAB code.


%Extra parameters/switches, all set at defaults, but can be changed
outname='';
read_lines_limit=10E12;
skip_lines=0;
missing_value=NaN;
blockn=1E4;
make_structure=0;  %use =1 for always structure output, =-1 for never
%Lines with '***' comment flags show where these are used;
%   except missing_value which is used in numerous places;

%check output options;
if(nargout>1);
   error('Too many output arguments, 0 or 1 allowed');
end;

%Setting file selection method parameters;
%   arg1 = 0 GUI selection, 1 command-line prompt, -1 file name supplied;
fname='';
fpath='';
if(nargin==0);
   arg1=0;
else
   in1=varargin{1};
   if isa(in1,'string'), in1=char(in1); end;
   if (isnumeric(in1) || islogical(in1)) && isscalar(in1) && (in1==1);
      arg1=1;
   elseif ischar(in1);
      fname=in1;
      arg1=-1;
   else
      error('First input must be a file name or the number 1');
   end;
end;
if(nargin>1)
    fpath=varargin{2};
    if isa(fpath,'string'), fpath=char(fpath); end;
    if isempty(fpath), fpath=''; end;
end;

 disp 'Running RDFREAD data file reading utility';
if(arg1==0);
    % GUI method to select file;
    [fname, fpath] = uigetfile( ...
        { ...
            '*.txt;*.csv;*.dat', 'ASCII Test Data Files'; ...
            '*.*','All Files (*.*)' ...
        },'Pick a file to load/read');
    % uigetfile returns 0 when the dialog is cancelled;
    if isequal(fname,0)
        error('No file selected');
    end;
elseif(arg1==1);
    % Command line method -- select file;
    fname=input('Input file name (ex. datafile.txt): ','s');
end;

%Open file and check to see it exists
%   fullfile copes with a folder given with or without trailing separator;
fullname=fullfile(fpath,fname);
fid = fopen(fullname,'r');
if isempty(fid) || (fid < 0)
    error(['File: ', fname, ' not found (in the MATLAB Path).']);
else,  disp(['Opening file: ', fname])
end

%set outname name;
if strcmp(outname,'');    %***;
   % drop any folder part, then keep the text before the first dot;
   [fdir, fbase, fextn]=fileparts(fname);
   outname=strtok([fbase fextn],'.');
end;
% make sure outname can be used as a variable name by assignin;
if ~isvarname(outname)
    outname=regexprep(outname,'[^A-Za-z0-9_]','_');
    if isempty(outname) || ~isvarname(outname)
        outname=['x' outname];
    end;
    if length(outname) > namelengthmax
        outname=outname(1:namelengthmax);
    end;
end;

%Everything that touches the open file is inside try/catch so that the
%   file is closed again even when reading fails;
try
    fline1=fgetl(fid);

    %optional skip lines case
    if skip_lines > 0;   %***;
      disp(['Skipping ' num2str(skip_lines) ' lines']);
      for ii=1:skip_lines;
        fline1=fgetl(fid);
      end;
    end;

    % fgetl returns -1 (not a char) once the end of file is reached;
    if ~ischar(fline1)
        error(['File: ', fname, ' has no lines to read.']);
    end;

    disp 'First line:';
    disp(fline1);

    %Detect the delimiter from the first line: comma, then tab, then space;
    hcomma=strfind(fline1,',');
    ss1=strread(fline1,'%s',-1,'delimiter','\t')';
    htab=size(ss1,2);

    if ~isempty(hcomma),
        disp 'Comma delimitted will be assumed';
        dlm=',';
        nv=numel(hcomma)+1;
    elseif htab > 1
        disp 'Tab delimitted will be assumed';
        dlm='\t';
        nv=htab;
    else
        dlm=' ';
        disp 'WARNING Comma or Tab delimitted datafiles read best, but space-separated assumed';
        nv=numel(strfind(fline1,dlm))+1;
    end;
    disp(['Number of data columns: '  num2str(nv)]);

    %Count number of numerics in first line;
    %   str2double is used (not str2num) so names such as 'pi' or 'eps'
    %   are not evaluated and mistaken for numbers;
    nv1=0;
    ss1=strread(fline1,'%s',nv,'delimiter',dlm);
    for jj=1:min(nv,numel(ss1));
        txt=strtrim(ss1{jj});
        val=str2double(txt);
        if ~isnan(val) || strcmpi(txt,'nan');  nv1=nv1+1;  end;
        %increments one if item jj is numeric;
    end;

    %Determine if variable names in header line
    if(nv1==0);
        disp 'First line assumed to be header with variable names';
        disp '    and will be skipped for data read';
        header=1;
    else
        disp 'First line assumed to be data, not variable names';
        header=0;
    end;

    if(header==1);
       fline=fgetl(fid);
    else
       fline=fline1;
    end;

    %Initialize xdat matrix and values for data reading loop;
    jjblockn=blockn;
    xdat=repmat(missing_value,jjblockn,nv);
    jj=0;

    %Loop to read data lines -- fline always holds the next unread line
    while ischar(fline);
        if(jj >= read_lines_limit);  %***;
            disp(['*** Reached limit to rows allowed: ' num2str(jj)]);
            break;
        end;
        jj=jj+1;
        if jj > jjblockn;
           %*** Rows added to xdat block to improve the loop speed
           xdat(jjblockn+1:jjblockn+blockn,1:nv)= missing_value;
           jjblockn=jjblockn+blockn;
        end;

        %disp(fline); %*** uncomment to display each data file line;

        %Fast path: read the whole line as numbers in one call;
        try
            xx=strread(fline,'%f',nv,'delimiter',dlm,'emptyvalue',missing_value);
        catch
            xx=[];
        end;
        nread=numel(xx);

        if(nread==nv);
            xdat(jj,:)=xx(:)';
        elseif(nread>0 && nread==nv-1);
            % last field empty (line ends with a delimiter);
            xdat(jj,:)=[xx(:)' missing_value];
        else
            %Slow path: convert column by column; anything that is not a
            %   real number (or absent on a short line) becomes missing_value;
            try
                fields=strread(fline,'%s',nv,'delimiter',dlm);
            catch
                fields={};
            end;
            rowvals=repmat(missing_value,1,nv);
            for cc=1:min(nv,numel(fields))
                txt=strtrim(fields{cc});
                val=str2double(txt);
                if isreal(val) && (~isnan(val) || strcmpi(txt,'nan'));
                    rowvals(cc)=val;
                end;
            end;
            xdat(jj,:)=rowvals;
        end;

        fline=fgetl(fid); %*** fline used at top of loop
    end;

    if ~ischar(fline);
        disp('Reached end of file');
    end;
catch
    fclose(fid);
    rethrow(lasterror);
end;
fclose(fid);

xdat=xdat(1:jj,:);
[rows columns]=size(xdat);
disp(['Number of rows sucessfully read: ' num2str(rows)]);
disp(['Number of columns in data: ' num2str(columns)]);


%*** Output steps;

if (make_structure==1) || ( (make_structure==0) && (header==1) );  %***;
    xdat=struct('data',xdat);
    if (header==1);
      xdat.varnames=cellstr(char(ss1))';
    end;
    disp('Creating structure with varnames and data fields');
elseif (make_structure==-1) || (header==0);
    disp('Results placed in data matrix');
end;

if nargout==0;  %***
    %Place results into base memory
    assignin('base',outname,xdat);
    disp(['See ' outname]);
elseif nargout==1;
   %Use varargout
    varargout{1}=xdat;
end;

disp('***Done***');

Contact us at files@mathworks.com