function [dtn,spl,tok] = datenum8601(str,tok)
% Convert an ISO 8601 formatted Date String (timestamp) to a Serial Date Number.
%
% (c) 2013 Stephen Cobeldick
%
% ### Function ###
%
% Syntax:
% DateNum = datenum8601(String)
% DateNum = datenum8601(String,Token)
% [DateNum,Split,Token] = datenum8601(...)
%
% By default the function automatically detects any ISO 8601 timestamp
% within the string, or an optional token can be used to restrict the
% timestamp style recognition. The ISO 8601 timestamp style options are:
%
% - Date in ordinal, calendar or week-numbering notation.
% - Basic or Extended format (without/with unit separation characters).
% - Any date-time separator character (with a few exceptions).
% - Full or lower precision (fewer trailing date/time units).
% - Decimal fraction of the trailing unit (decimal places).
%
% These style options are explained in the tables below (see "Timestamps").
%
% The function returns a Serial Date Number, the input string parts
% that are split by the detected timestamp (i.e. before & after the
% timestamp) and the string token showing the detected timestamp style.
% If no timestamp is found then DateNum is [] and Token is ''.
%
% Note 1: Undefined month/date/week/day input string values default to one.
% Note 2: Undefined hour/minute/second input string values default to zero.
% Note 3: Auto-detection mode also accepts a mix of basic/extended formats.
% Note 4: Calls undocumented MATLAB functions "datenummx" & "ismembc".
%
% See also DATESTR8601 CLOCK NOW DATENUM DATEVEC DATESTR DATEROUND
%
% ### Examples ###
%
% Examples use the date+time described by the vector [1999,1,3,15,6,48.0568].
%
% datenum8601(datestr8601([1999,1,3,15,6,48.0568],'ymdHMS4'))
% ans = 730123.62972287962
%
% datenum8601('1999-01-03 15:06:48.0568')
% ans = 730123.62972287962
%
% datenum8601('1999003T150648.0568')
% ans = 730123.62972287962
%
% [dtn,spl,tok] = datenum8601('AA1998W537_150648.0568ZZ')
% dtn = 730123.62972287962
% spl = {'AA','ZZ'}
% tok = 'YWD_HMS4'
%
% [dtn,spl,tok] = datenum8601('1999-003T15')
% dtn = 730123.6250
% spl = {'',''}
% tok = '*ynH'
%
% [dtn,spl,tok] = datenum8601('1999-01-03T15','*ymd')
% dtn = 730123.0000
% spl = {'','T15'}
% tok = '*ymd'
%
% ### ISO 8601 Timestamps ###
%
% Input | Basic Format | Extended Format (token prefix '*')
% Date | In/Out | Input Timestamp | In/Out | Input Timestamp
% Notation| Token: | Example: | Token: | Example:
% --------|--------|-----------------|---------|---------------------------
% Ordinal |'ynHMS' |'1999003T150648' |'*ynHMS' |'1999-003T15:06:48'
% --------|--------|-----------------|---------|---------------------------
% Calendar|'ymdHMS'|'19990103T150648'|'*ymdHMS'|'1999-01-03T15:06:48'
% --------|--------|-----------------|---------|---------------------------
% Week |'YWDHMS'|'1998W537T150648'|'*YWDHMS'|'1998-W53-7T15:06:48'
% --------|--------|-----------------|---------|---------------------------
%
% Timestamp may omit trailing units (reduced precision), eg: | Output->Vector:
% --------|--------|-----------------|---------|-----------------|---------------------
% |'Y' |'1999W' |'*Y' |'1999-W' |[1999,1,4,0,0,0]
% --------|--------|-----------------|---------|-----------------|---------------------
% |'ymdH' |'19990103T15' |'*ymdH' |'1999-01-03T15' |[1999,1,3,15,0,0]
% --------|--------|-----------------|---------|-----------------|---------------------
% Date-time separator character can be specified (default='T'), eg:
% --------|--------|-----------------|---------|-----------------|---------------------
% |'yn_HM' |'1999003_1506' |'*yn_HM' |'1999-003_15:06' |[1999,1,3,15,6,0]
% --------|--------|-----------------|---------|-----------------|---------------------
% |'YWD@H' |'1998W537@15' |'*YWD@H' |'1998-W53-7@15' |[1999,1,3,15,0,0]
% --------|--------|-----------------|---------|-----------------|---------------------
% Trailing date/time value can have decimal digits (fraction), eg:
% --------|--------|-----------------|---------|-----------------|---------------------
% |'ynH3' |'1999003T15.113' |'*ynH3' |'1999-003T15.113'|[1999,1,3,15,6,46.80]
% --------|--------|-----------------|---------|-----------------|---------------------
% |'YWD4' |'1998W537.6297' |'*YWD4' |'1998-W53-7.6297'|[1999,1,3,15,6,46.08]
% --------|--------|-----------------|---------|-----------------|---------------------
% |'y10' |'1999.0072047202'|'*y10' |'1999.0072047202'|[1999,1,3,15,6,48.06]
% --------|--------|-----------------|---------|-----------------|---------------------
%
% Note 5: Function does not check for ISO 8601 compliance: user beware!
% Note 6: Date-time separator must not be any of [+-./0123456789:DFHMPRSWYZdmny].
%
% ### Inputs & Outputs ###
%
% Inputs:
% String = String, possibly containing an ISO 8601 timestamp (date & time).
% Token = String token to select the date notation & format (default=any).
%
% Outputs:
% DateNum = Numeric Scalar, the input ISO 8601 timestamp as a Serial Date Number.
% Split = CellOfStrings, the strings before and after the detected timestamp.
% Token = String Token (see tables), the detected date notation & format.
%
% Inputs = (String,Token*)
% Outputs = [DateNum,Split,Token]

% Define "regexp" match string. Both branches produce a pattern with
% thirteen capture groups, in this order:
%  1=year, 2='-', 3='W', 4=week/month, 5='-', 6=day, 7=separator,
%  8=hour, 9=':', 10=minute, 11=':', 12=second, 13=decimal fraction.
if nargin>1 % User requests a specific timestamp.
    [mtc,typ] = d8601Usr(tok);
else % Automagically detect timestamp, with any date-time separator.
    typ = 0; % Zero flags "notation not yet known", resolved after matching.
    mtc = [...
        '(\d{4})',... % year
        '((-(?=(\d{2,3}|W)))?)',... % -
        '(W?)',... % W
        '(?(3)(\d{2})?|(\d{2}$|\d{2}(?=(\D|\d{2})))?)',... % week/month
        '(?(4)(-(?=(?(3)\d|\d{2})))?)',... % -
        '(?(4)(?(3)\d|\d{2})?|(\d{3})?)',... % day of week/month/year
        '(?(6)([^\+\-\./0123456789:DFHMPRSWYZdmny](?=\d{2}))?)',... % separator
        '(?(7)(\d{2})?)',... % hour
        '(?(8)(:(?=\d{2}))?)',... % :
        '(?(8)(\d{2})?)',... % minute
        '(?(10)(:(?=\d{2}))?)',... % :
        '(?(10)(\d{2})?)',... % second
        '((\.\d+)?)']; % decimal fraction
    % (allows any combination of basic/extended formats)
end
%
assert(ischar(str)&&size(str,1)<2,'Input "str" must be a string')
%
% Extract timestamp tokens, return split strings:
[tkn,spl] = regexp(str,mtc,'tokens','split','once');
%
% Timestamp not found in str:
if isempty(tkn)
    tok = '';
    dtn = [];
    return
end
%
% Lengths of matched tokens (zero length means "unit not present"):
len = cellfun('length',tkn);
%
% Preallocate Date Vector (month & day default to one, time to zero):
dtv = [0,1,1,0,0,0];
% Convert date & time values to numeric. After this loop "m" holds the
% position (in the date vector) of the last unit that was present, which
% is the unit that any decimal fraction applies to.
idx = [1,4,6,8,10,12];
for m = find(len(idx))
    dtv(m) = sscanf(tkn{idx(m)},'%f');
end
%
% Create token of 8601 timestamp (see also "datestr8601"):
if typ==0
    % 1=week-numbering, 2=calendar, 3=ordinal:
    typ = 2-len(3)+(len(6)==3); % (must define before fraction & week parsing)
    % Extended format prefix. This must be an EMPTY string for the basic
    % format: char(0) is a one-character NUL string, which would corrupt
    % the returned token and make it unusable as an input token.
    if any(len([2,5,9,11])==1)
        Ext = '*';
    else
        Ext = '';
    end
    Sep = tkn{7};
    % The '*' in 'y*nHMS' is only a placeholder for the absent month
    % position, it is never selected because len(4) is zero for ordinal.
    if strcmp('T',Sep) % Default date-time separator
        tkc = {'YWDHMS','ymdHMS','y*nHMS'};
        tok = [Ext,tkc{typ}(0<len([1,4,6,8,10,12]))];
    else % Custom date-time separator
        tkc = {['YWD',Sep,'HMS'],['ymd',Sep,'HMS'],['y*n',Sep,'HMS']};
        tok = [Ext,tkc{typ}(0<len([1,4,6,7,8,10,12]))];
    end
    % Number of decimal digits (token 13 includes the leading '.'):
    Dgt = sprintf('%.0f',len(13)-1);
else
    % User supplied token is returned unchanged.
    Dgt = '';
end
%
% Convert decimal fraction value:
if 1<len(13)
    if typ==2&&m==2 % Month (special case not converted by "datenummx"):
        dtv(3) = 1+sscanf(tkn{13},'%f')*(datenummx(dtv+[0,1,0,0,0,0])-datenummx(dtv));
    else % All other date or time values (are converted by "datenummx"):
        dtv(m) = dtv(m)+sscanf(tkn{13},'%f');
    end
    tok = [tok,Dgt];
end
%
if typ==1 % Week-numbering vector to ordinal vector:
    dtv(3) = dtv(3)+7*dtv(2)-4-mod(datenummx([dtv(1),1,1]),7);
    dtv(2) = 1;
end
%
% Convert out-of-range-date-vector to Serial Date Number: Do not remove this!
dtn = datenummx(dtv);
%
% Month zero (special case not converted by "datenummx"):
if 0==dtv(2)
    dtn = dtn-31; % Faster.
    %dtn = addtodate(dtn,-1,'month'); % Adds rounding error.
end
%
end
%--------------------------------------------------------------------------
function [mtc,typ] = d8601Usr(tok)
% Identify 8601 timestamp notation using user input token.
%
% Input:
%  tok = char row vector, the timestamp token, e.g. '*ymdHMS4' or 'yn_HM'.
%
% Outputs:
%  mtc = char row vector, a "regexp" pattern with thirteen capture groups:
%        1=year, 2='-', 3='W', 4=week/month, 5='-', 6=day, 7=separator,
%        8=hour, 9=':', 10=minute, 11=':', 12=second, 13=decimal fraction.
%        Groups for units that the token does not request are empty '()'.
%  typ = numeric scalar, 1=week-numbering, 2=calendar, 3=ordinal.
%
% Throws an error if the token is not a char row vector, has more than
% one separator character, has the separator anywhere other than
% immediately before 'H', or does not match a known notation.
%
assert(ischar(tok)&&isrow(tok),'Input "tok" must be a string')
%
% Check if extended or basic, check if decimal fraction:
Ext = strncmp('*',tok,1);
DcP = find(~isstrprop(tok,'digit'),1,'last'); % last non-digit character
Dgt = tok(DcP+1:end); % trailing digits = number of decimal places
tok = tok(1+Ext:DcP); % strip the '*' prefix and the trailing digits
% Identify date-time separator and start of timestamp:
IsT = ismembc(tok,'+-./0123456789:DFHMPRSWYZdmny'); % (presorted)
tkl = sum(IsT); % number of date/time unit characters in the token
typ = find([strncmp(tok(IsT),{'YWDHMS','ymdHMS','ynHMS'},tkl),true],1,'first');
switch sum(~IsT)
    case 0 % Standard 'T' separator.
        Sep = '(T)';
    case 1 % User supplied separator.
        % The character following the separator must be the hour 'H':
        nxt = strcmp('H',tok([false,~IsT(1:end-1)]));
        assert(nxt,'Input token date-time separator position incorrect.')
        % Escape the separator so that characters which are special in
        % regular expressions (e.g. '|', '*', '(') are matched literally:
        Sep = ['(',regexptranslate('escape',tok(~IsT)),')'];
    otherwise
        error('Input token is not recognized: too many separator chars.')
end
%
% Year and time tokens:
mtc = cell(1,13);
mtc([1,7,8,10,12,13]) = {'(\d{4})',Sep,'(\d{2})',... % year, separator, hour
    '(\d{2})','(\d{2})',['(\.\d{',Dgt,'})']}; % minute, second, decimal fraction
% Format tokens (extended/basic):
if Ext
    mtc([2,5,9,11]) = {'(-)','(-)','(:)','(:)'};
else
    mtc([2,5,9,11]) = {'()','()','()','()'};
end
%
% Date tokens. idz(k) is the index of the first pattern element to be
% blanked out when the token contains k date/time unit characters.
switch typ
    case 1 % Week
        idz = [2,5,7,9,11,13,14];
        mtc([3,4,6]) = {'(W)','(\d{2})','(\d{1})'};
    case 2 % Calendar
        idz = [2,5,7,9,11,13,14];
        mtc([3,4,6]) = {'()', '(\d{2})','(\d{2})'};
    case 3 % Ordinal
        idz = [2,7,9,11,13,14];
        mtc([3,4,5,6]) = {'()','()','()','(\d{3})'};
    otherwise
        error('Input token is not recognized.')
end
%
% A token without any date/time unit characters cannot index "idz":
assert(tkl>0,'Input token is not recognized.')
%
% Blank out the units that were not requested (reduced precision), and
% also the decimal fraction when no decimal digits were requested:
mtc(idz(tkl):12+isempty(Dgt)) = {'()'};
% Concatenate tokens into "regexp" match token:
mtc = [mtc{:}];
%
end
%----------------------------------------------------------------------End!