% NHANESCOPE: National Health and Nutrition Examination Survey Clinical Outcome Prediction Expert
% This is a companion to SCOPE (SEER Clinical Outcome Prediction Expert)
% This program is a part of a software suite developed to analyze NCHS (National Center for Health Statistics) public use NHANES data (National
% Health and Nutrition Examination Survey)
% Winter 2013, Main Line, Pennsylvania, USA.
% Dr. Rex Cheung, cheung.r100@gmail.com
%This program has been validated with the public use adult.sas (saved as adult.txt by a free SAS viewer) and
%adult.dat files. These files can be downloaded from the NHANES III website. This program has successfully read the first 10 records (rows), each with 1238 variables, out of
%approximately 20,000 records, loaded them into a Matlab dataset object, and associated the variable names and
%descriptions with the variables. The loaded data set should be sufficient for initial exploratory
%purposes. This program has left out some specific formatting information available in the SAS file, such as the width and precision of the
%variables for display. However, the relevant information has been collected in the FORMAT and LENGTH sections and could be used for that.
%Please contact me if you have any suggestions or comments.
%This program expects 1. the filename of a SAS file provided by NCHS NHANES and 2. the filename of
%the public use NHANES data; both can be downloaded from the NHANES III website.
%This program assumes the SAS code from NCHS NHANES website has the following format:
% ...........
% LENGTH
% SEQN 7
% .....
% HAZNOK5R 3
% ;
%
% FORMAT
% DMPPIR Z6.3
% .....
% HAT25MET Z3.2
% ;
%
% INPUT
% SEQN 1-5
% ....
% HAZNOK5R 3345-3346
% ;
%
% LABEL
% SEQN = "Sequence number"
% ....
% HAZNOK5R = "Number of BP's used for average K5";
% ...........
function [NHANESToReadSASDataIntoMatlabSuccess, ds] = NHANESCOPEUseSASCodeToReadNHANESDataFileToMatlab(SASCodeFileName,NHANESDataFileName,MaxRecords)
%Use the SAS setup code shipped with an NHANES fixed-column ASCII data file
%to load that data file into a Matlab dataset object.
%
%Inputs
%  SASCodeFileName    - name of the SAS code file (saved as text). Its INPUT
%                       section is required; FORMAT, LENGTH and LABEL
%                       sections are used when present.
%  NHANESDataFileName - name of the fixed-column ASCII data file.
%  MaxRecords         - (optional) maximum number of records (rows) to load.
%                       Defaults to 10; pass Inf to load every record.
%
%Outputs
%  NHANESToReadSASDataIntoMatlabSuccess - 1 when the function completes.
%                       Failures are reported by throwing an error.
%  ds                 - (optional) dataset with one column per INPUT
%                       variable and one row per loaded record. The LABEL
%                       texts are stored in ds.Properties.VarDescription.
%
%Side effects
%  Four text files are written next to the SAS code file; their names are the
%  SAS code file name followed by InputVariableFile.txt,
%  VariableFormatFile.txt, VariableLENGTHFile.txt and
%  VariableLabelValueFile.txt. Each file holds one record per INPUT variable
%  and is overwritten on every run so that repeated runs do not accumulate
%  duplicate records.
%
%Notes
%  * Every field is converted with str2double, so blank fields, character
%    ($) fields and anything else that is not a number are stored as NaN.
%  * FORMAT, LENGTH and LABEL entries are matched to INPUT variables by
%    name, so these sections may list the variables in any order.

NHANESToReadSASDataIntoMatlabSuccess = 0;

if nargin < 3 || isempty(MaxRecords)
    MaxRecords = 10; %historical default: only the first 10 records are loaded
end
if ~isnumeric(MaxRecords) || ~isscalar(MaxRecords) || isnan(MaxRecords) || MaxRecords < 1
    error('NHANESCOPE:badMaxRecords', 'MaxRecords must be a positive number or Inf.');
end

%Read the SAS code once; every section is parsed from this in-memory copy
codeLines = nhanescopeReadLines(SASCodeFileName, Inf);

%------------- INPUT SECTION ------------------------------------------
%Each line is NAME [$] start[-end]. The '$' (character variable marker) is
%ignored here; it is picked up from the LENGTH section below.
inputLines = nhanescopeSectionLines(codeLines, 'INPUT');
if isempty(inputLines)
    error('NHANESCOPE:noInputSection', 'No INPUT section was found in %s.', SASCodeFileName);
end
inputvarname = cell(1, numel(inputLines));
inputvarpositionstart = zeros(1, numel(inputLines));
inputvarpositionend = zeros(1, numel(inputLines));
numVars = 0;
for n = 1:numel(inputLines)
    [name, rest] = strtok(inputLines{n});
    rest = strrep(rest, '$', ' ');
    columnRange = regexp(rest, '(\d+)\s*-\s*(\d+)', 'tokens', 'once');
    if isempty(columnRange)
        %A single column position, e.g. "HSSEX 15": start and end coincide
        singleColumn = regexp(rest, '\d+', 'match', 'once');
        columnRange = {singleColumn, singleColumn};
    end
    firstColumn = str2double(columnRange{1});
    lastColumn = str2double(columnRange{2});
    if isnan(firstColumn) || isnan(lastColumn) || firstColumn < 1 || lastColumn < firstColumn
        warning('NHANESCOPE:badInputLine', 'Skipping INPUT line that could not be parsed: %s', inputLines{n});
        continue
    end
    numVars = numVars + 1;
    inputvarname{numVars} = name;
    inputvarpositionstart(numVars) = firstColumn;
    inputvarpositionend(numVars) = lastColumn;
end
if numVars == 0
    error('NHANESCOPE:noInputVariables', 'The INPUT section of %s lists no usable variables.', SASCodeFileName);
end
inputvarname = inputvarname(1:numVars);
inputvarpositionstart = inputvarpositionstart(1:numVars);
inputvarpositionend = inputvarpositionend(1:numVars);
nhanescopeWriteRecords(strcat(SASCodeFileName,'InputVariableFile.txt'), '%s \n%d \n%d \n', ...
    inputvarname, inputvarpositionstart, inputvarpositionend);

%--------------- FORMAT -----------------------------------------------
%SAS FORMAT Zw.d: Z = leading zeros, w = total width, d = number of decimals.
%Variables without a Zw.d format keep the placeholder name '0' and zeros.
VarFormatVarName = repmat({'0'}, numVars, 1);
VarFormatLeadingZero = zeros(numVars, 1);
VarFormatWidth = zeros(numVars, 1);
VarFormatNumberDecimals = zeros(numVars, 1);
formatLines = nhanescopeSectionLines(codeLines, 'FORMAT');
for n = 1:numel(formatLines)
    [name, rest] = strtok(formatLines{n});
    idx = find(strcmp(inputvarname, name), 1);
    if isempty(idx)
        continue
    end
    VarFormatVarName{idx} = name;
    zSpec = regexp(rest, 'Z(\d+)\.(\d*)', 'tokens', 'once');
    if ~isempty(zSpec)
        VarFormatLeadingZero(idx) = 1;
        VarFormatWidth(idx) = str2double(zSpec{1});
        if numel(zSpec) >= 2 && ~isempty(zSpec{2})
            VarFormatNumberDecimals(idx) = str2double(zSpec{2});
        end
    end
end
nhanescopeWriteRecords(strcat(SASCodeFileName,'VariableFormatFile.txt'), '%s \n%d \n%d \n%d \n', ...
    VarFormatVarName, VarFormatLeadingZero, VarFormatWidth, VarFormatNumberDecimals);

%-------------- LENGTH ------------------------------
%A '$' on a LENGTH line marks the variable as a character (string) variable
VarLengthVarName = repmat({'0'}, numVars, 1);
VarLengthStringIndicator = zeros(numVars, 1);
lengthLines = nhanescopeSectionLines(codeLines, 'LENGTH');
for n = 1:numel(lengthLines)
    [name, rest] = strtok(lengthLines{n});
    idx = find(strcmp(inputvarname, name), 1);
    if isempty(idx)
        continue
    end
    VarLengthVarName{idx} = name;
    VarLengthStringIndicator(idx) = any(rest == '$');
end
nhanescopeWriteRecords(strcat(SASCodeFileName,'VariableLENGTHFile.txt'), '%s \n%d \n', ...
    VarLengthVarName, VarLengthStringIndicator);

%----------- LABEL ---------------------------------------------
%Each line is NAME = "description". The surrounding quotes are removed.
VarLabelVarName = repmat({'0'}, 1, numVars);
VarLabelValue = repmat({''}, 1, numVars);
labelLines = nhanescopeSectionLines(codeLines, 'LABEL');
for n = 1:numel(labelLines)
    thisLine = labelLines{n};
    equalsPosition = find(thisLine == '=', 1);
    if isempty(equalsPosition)
        continue
    end
    name = strtrim(thisLine(1:equalsPosition-1));
    idx = find(strcmp(inputvarname, name), 1);
    if isempty(idx)
        continue
    end
    labelText = strtrim(thisLine(equalsPosition+1:end));
    if numel(labelText) >= 2 && any(labelText(1) == '"''') && labelText(end) == labelText(1)
        labelText = labelText(2:end-1);
    end
    VarLabelVarName{idx} = name;
    VarLabelValue{idx} = labelText;
end
%fprintf drops empty arguments, which would misalign the records, so
%missing or empty labels are written to the file as the placeholder '0'
labelFileValues = VarLabelValue;
labelFileValues(cellfun('isempty', labelFileValues)) = {'0'};
nhanescopeWriteRecords(strcat(SASCodeFileName,'VariableLabelValueFile.txt'), '%s \n%s \n', ...
    VarLabelVarName, labelFileValues);

%------------- Loading the NHANES data into Matlab workspace ------------
%The records are read line by line with fgetl, which keeps leading blanks,
%so the column positions from the INPUT section stay valid.
dataLines = nhanescopeReadLines(NHANESDataFileName, MaxRecords);
numRecords = numel(dataLines);
if numRecords == 0
    error('NHANESCOPE:noData', 'No records could be read from %s.', NHANESDataFileName);
end
%char() pads short lines with blanks; widen further if the INPUT section
%refers to columns beyond the longest line so that indexing cannot fail
recordBlock = char(dataLines);
lastColumnNeeded = max(inputvarpositionend);
if size(recordBlock, 2) < lastColumnNeeded
    recordBlock(:, end+1:lastColumnNeeded) = ' ';
end
recordValues = NaN(numRecords, numVars);
for k = 1:numVars
    %Convert one whole column of fields at a time
    fieldText = cellstr(recordBlock(:, inputvarpositionstart(k):inputvarpositionend(k)));
    fieldValues = str2double(fieldText);
    %str2double reads text such as 'i' or '2j' as a complex number; such
    %fields are not numeric data and are stored as NaN
    fieldValues(imag(fieldValues) ~= 0) = NaN;
    recordValues(:, k) = real(fieldValues);
end

%------Format the variables in Matlab into a dataset ready for analysis --
disp('The number of input variables are:')
disp(numVars)
disp('The number of records loaded:')
disp(numRecords)
%dataset() expects one {data, name} pair per variable, passed as separate
%arguments, hence the cell expansion with {:}
columnSpecs = cell(1, numVars);
for k = 1:numVars
    columnSpecs{k} = {recordValues(:, k), inputvarname{k}};
end
ds = dataset(columnSpecs{:});
%VarDescription takes the whole cell array of strings (no {:} expansion)
ds.Properties.VarDescription = VarLabelValue;

NHANESToReadSASDataIntoMatlabSuccess = 1; %Returns success

function lines = nhanescopeReadLines(fileName, maxLines)
%Read at most maxLines lines of a text file into a column cell array of
%strings. Line terminators are removed; leading and trailing blanks are kept.
fid = fopen(fileName, 'rt');
if fid < 0
    error('NHANESCOPE:cannotOpenFile', 'Cannot open file for reading: %s', fileName);
end
closeFile = onCleanup(@() fclose(fid)); %closes the file even if an error occurs
lines = {};
nextLine = fgetl(fid);
while ischar(nextLine) && numel(lines) < maxLines
    if ~isempty(nextLine) && nextLine(end) == char(13)
        nextLine(end) = []; %drop a carriage return left over from CR/LF files
    end
    lines{end+1, 1} = nextLine; %#ok<AGROW>
    nextLine = fgetl(fid);
end

function sectionLines = nhanescopeSectionLines(codeLines, keyword)
%Return the trimmed, non-blank lines of the first SAS section whose opening
%line has keyword as its first word. The section ends at a line that starts
%with ';', at a line that ends with ';' (that line is kept, without the
%';'), or at the end of the file. An empty cell is returned when the
%keyword is not found.
sectionLines = {};
if isempty(codeLines)
    return
end
firstWords = strtok(codeLines);
keywordIndex = find(strcmp(firstWords, keyword), 1, 'first');
if isempty(keywordIndex)
    return
end
for n = keywordIndex+1:numel(codeLines)
    thisLine = strtrim(codeLines{n});
    if isempty(thisLine)
        continue
    end
    if thisLine(1) == ';'
        break
    end
    closesSection = (thisLine(end) == ';');
    if closesSection
        thisLine = strtrim(thisLine(1:end-1));
    end
    if ~isempty(thisLine)
        sectionLines{end+1, 1} = thisLine; %#ok<AGROW>
    end
    if closesSection
        break
    end
end

function nhanescopeWriteRecords(fileName, recordFormat, varargin)
%Write one formatted record per element to a text file, replacing any
%existing file. Each extra argument is one field of the record, given either
%as a cell array of strings or as a numeric vector; all must have the same
%number of elements.
fid = fopen(fileName, 'wt');
if fid < 0
    error('NHANESCOPE:cannotOpenFile', 'Cannot open file for writing: %s', fileName);
end
closeFile = onCleanup(@() fclose(fid)); %closes the file even if an error occurs
numFields = numel(varargin);
recordFields = cell(1, numFields);
for r = 1:numel(varargin{1})
    for c = 1:numFields
        if iscell(varargin{c})
            recordFields{c} = varargin{c}{r};
        else
            recordFields{c} = varargin{c}(r);
        end
    end
    fprintf(fid, recordFormat, recordFields{:});
end