Code covered by the BSD License  

Highlights from
Error-tolerant parsing of newline-delimited data

Error-tolerant parsing of newline-delimited data

by Kevin Bartlett

 

Adaptive parsing of newline-separated data. Handles bad lines WITHOUT reading line-by-line.

adaptive_parse_parse_demo(C,lineNumbers,parseOptions)
function [parsed] = adaptive_parse_parse_demo(C,lineNumbers,parseOptions) %#ok<INUSD>
%
% adaptive_parse_parse_demo.m--Parsing function for demonstrating use of
% adaptive_parse().
%
% Input:
%   C            - Data to parse, in one of three forms:
%                    * 1x7 cell array as returned by textscan under a
%                      format control string (columns 1-6 are the date
%                      fields, column 7 holds the data strings);
%                    * Nx1 cell array of chars (cell array of data passed
%                      to adaptive_parse as an input argument);
%                    * 1x1 cell array containing an Nx1 cell (textscan
%                      with no format control string).
%   lineNumbers  - Line number of each line of data in C.
%   parseOptions - Not used in this demo function, but it MUST appear in
%                  the function declaration.
%
% Output:
%   parsed - Empty ([]) if no recognised data lines were found; otherwise
%            a structure with, for each line type present, a data field
%            (pressure, temperature or noiseAmplitude) plus matching
%            "_mtime" and "_lineNumbers" fields.
%
% Errors are thrown deliberately for timestamps that cannot be converted
% to numbers and for 'a510' lines that are not 48 characters long; this
% serves to demonstrate how adaptive_parse deals with bad data lines.
%
% Syntax: parsed = adaptive_parse_parse_demo(C,lineNumbers,parseOptions)

% Developed in Matlab 7.11.0.584 (R2010b) on GLNX86
% for the VENUS project (http://venus.uvic.ca/).
% Kevin Bartlett (kpb@uvic.ca), 2011-03-22 10:46
%-------------------------------------------------------------------------

% Default return value, so that the output is always assigned even when
% every line ends up being filtered out.
parsed = [];

if isempty(C)
    return;
end % if

if size(C,2) == 7
    % Data was read in by adaptive_parse.m using textscan under the control
    % of a format string. This lends itself to very efficient processing.
    mtime = datenum(C{1},C{2},C{3},C{4},C{5},C{6});
    C = C{7};
else
    if isa(C{1},'cell')
        % Data was read in by adaptive_parse.m using textscan, but without
        % specifying a format string. Extract the cell array of characters
        % from C so that it can be processed the same way as a cell array
        % that has been passed to adaptive_parse as an input argument.
        C = C{1};
    end % if

    % Data is now a cell array of characters, so the date has to be
    % extracted from fixed character columns, which is less efficient
    % than the textscan route above.

    % ...Remove lines that aren't long enough for a full datestamp and
    % 5-character line ID.
    longEnoughIndex = cellfun('length',C)>=26;
    lineNumbers = lineNumbers(longEnoughIndex);
    C = C(longEnoughIndex);

    if isempty(C)
        return;
    end % if

    % Extract the date information from the cell array. str2double is
    % used rather than str2num because str2num eval()s its input, which
    % is unsafe on text read from a file.
    C = char(C);
    dateColumns = {1:4, 5:6, 7:8, 10:11, 12:13, 14:19};
    dateParts = zeros(size(C,1),numel(dateColumns));

    for iPart = 1:numel(dateColumns)
        dateParts(:,iPart) = str2double(cellstr(C(:,dateColumns{iPart})));
    end % for

    % str2double returns NaN for text it cannot convert. Treat that as a
    % bad data line and throw, so that adaptive_parse can deal with it.
    if any(isnan(dateParts(:)))
        error([mfilename '.m--Unparseable timestamp found.']);
    end % if

    mtime = datenum(dateParts(:,1),dateParts(:,2),dateParts(:,3),...
        dateParts(:,4),dateParts(:,5),dateParts(:,6));

    % Convert non-date portion of data back to cell array. This will put
    % the data in the same form as if read in by adaptive_parse.m using
    % textscan.
    C = cellstr(C(:,22:end));
end % if

% Demonstrate adaptive_parse.m's response to running out of memory. This
% simulates loading an extremely large file (uncomment the line inside the
% if-block to activate).
if size(C,1)>200
   %dummy = peaks(99999999);
end % if

% From here on, C, lineNumbers and mtime are kept aligned: every time
% lines are removed from C, the same elements are removed from the other
% two.

% Keep only those lines of sufficient length to parse.
longEnoughIndex = cellfun('length',C)>=5;
C = C(longEnoughIndex);
lineNumbers = lineNumbers(longEnoughIndex);
mtime = mtime(longEnoughIndex);

if isempty(C)
    return;
end % if

% Remove all whitespace.
C = regexprep(C, '\s+', '');

% Removing whitespace may have shortened some lines; keep only those that
% still hold a complete 4-character line ID.
hasIDIndex = cellfun('length',C)>=4;
C = C(hasIDIndex);
lineNumbers = lineNumbers(hasIDIndex);
mtime = mtime(hasIDIndex);

if isempty(C)
    return;
end % if

% Keep only those lines consisting entirely of hex characters.
isHexIndex = cellfun(@isempty,regexpi(C,'[^a-f0-9]'));
C = C(isHexIndex);
lineNumbers = lineNumbers(isHexIndex);
mtime = mtime(isHexIndex);

if isempty(C)
    return;
end % if

% Keep only those lines with recognised line types.
lineIDs = cellfun(@(x) x(1:4), C,'UniformOutput',false);
lineTypes = {'a510' 'a511' 'a512'};
knownTypeIndex = ismember(lineIDs,lineTypes);
C = C(knownTypeIndex);
lineNumbers = lineNumbers(knownTypeIndex);
mtime = mtime(knownTypeIndex);
lineIDs = lineIDs(knownTypeIndex);

if isempty(C)
    return;
end % if

% Parse the 'a510' lines.
thisType = 'a510';
thisTypeIndex = find(ismember(lineIDs,thisType));

if ~isempty(thisTypeIndex)

    % If any lines of data are the wrong length, throw an error. This could
    % be done more gracefully in actual use, but it will serve to
    % demonstrate how adaptive_parse deals with bad data lines.
    thisTypeLines = C(thisTypeIndex);

    if any(cellfun('length',thisTypeLines)~=48)
        error([mfilename '.m--Data line of wrong length found.']);
    end % if

    thisTypeLines = char(thisTypeLines);
    parsed.pressure = (65536/1000)*hex2dec(thisTypeLines(:,9:10)) + hex2dec(thisTypeLines(:,13:14)); % metres
    parsed.pressure_mtime = mtime(thisTypeIndex);
    parsed.pressure_lineNumbers = lineNumbers(thisTypeIndex);
end % if

% Parse the 'a511' lines.
thisType = 'a511';
thisTypeIndex = find(ismember(lineIDs,thisType));

if ~isempty(thisTypeIndex)
    thisTypeLines = char(C(thisTypeIndex));
    % The two hex byte pairs are swapped before conversion.
    parsed.temperature = hex2dec([thisTypeLines(:,43:44) thisTypeLines(:,41:42)])./100;
    parsed.temperature_mtime = mtime(thisTypeIndex);
    parsed.temperature_lineNumbers = lineNumbers(thisTypeIndex);
end % if

% Parse the 'a512' lines.
thisType = 'a512';
thisTypeIndex = find(ismember(lineIDs,thisType));

if ~isempty(thisTypeIndex)
    thisTypeLines = char(C(thisTypeIndex));
    parsed.noiseAmplitude = hex2dec(thisTypeLines(:,25:26));
    parsed.noiseAmplitude_mtime = mtime(thisTypeIndex);
    parsed.noiseAmplitude_lineNumbers = lineNumbers(thisTypeIndex);
end % if

Contact us