% NHANESCOPE: National Health and Nutrition Examination Survey Clinical Outcome Prediction Expert
% NHANESCOPEConvertSASCodeToSTATACodeToReadNHANESDataFile
% This is a companion to SCOPE (SEER Clinical Outcome Prediction Expert)
% This program is part of a software suite developed to analyze NCHS (National Center for Health Statistics) public-use NHANES data (National
% Health and Nutrition Examination Survey)
% Winter 2013, Main Line, Pennsylvania, USA.
% Dr. Rex Cheung, cheung.r100@gmail.com
%This program has been validated with the public-use adult.sas (saved as adult.txt by a free SAS viewer) and
%adult.dat files. These files can be downloaded from the NHANES III website.
%This program expects 1. the filename of a SAS file provided by NCHS
%NHANES, 2. the filename of a STATA 12 do-file that will read the data, 3. the filename of a STATA do-file to store the formats and 4. the NHANES data file
%name
%Commands to run the script:
%SASCodeFileName = 'adult.txt'; STATADoFileName = 'stataadult1.txt';
%STATAFormatDOFileName = 'stataadultformat1.txt'; NHANESDataFileName = 'adult.dat';
% NHANESCOPEConvertSASCodeToSTATACodeToReadNHANESDataFileSuccess = NHANESCOPEConvertSASCodeToSTATACodeToReadNHANESDataFile(SASCodeFileName,STATADoFileName, STATAFormatDOFileName, NHANESDataFileName)
%It has been tested without problems in STATA 12
%Make sure adult.dat is in the STATA working directory
%1. Load the stataadult1.txt, save as a do-file, run it, you will get 1238
%variables and 20050 records (rows)
%2. Load the stataadultformat1.txt, save as a do-file, run it, you will
%have all the variables formatted according to the formatting information
%in the SAS file.
%NOTE: STATA must load the data before it can format the variables, so run the data do-file before the format do-file
%This program assumes the SAS code from NCHS NHANES website has the following format:
% ...........
% LENGTH
% SEQN 7
% .....
% HAZNOK5R 3
% ;
%
% FORMAT
% DMPPIR Z6.3
% .....
% HAT25MET Z3.2
% ;
%
% INPUT
% SEQN 1-5
% ....
% HAZNOK5R 3345-3346
% ;
%
% LABEL
% SEQN = "Sequence number"
% ....
% HAZNOK5R = "Number of BP's used for average K5";
% ...........
function NHANESCOPEConvertSASCodeToSTATACodeToReadNHANESDataFileSuccess = NHANESCOPEConvertSASCodeToSTATACodeToReadNHANESDataFile(SASCodeFileName,STATADoFileName, STATAFormatDOFileName, NHANESDataFileName)
%Convert the LENGTH, FORMAT, INPUT and LABEL sections of an NCHS NHANES SAS
%program into two STATA do-files.
%
%Inputs:
%  SASCodeFileName       - name of the SAS program (text file) to convert
%  STATADoFileName       - name of the STATA do-file that receives the infix
%                          command, the variable labels and the save command
%  STATAFormatDOFileName - name of the STATA do-file that receives the
%                          format commands
%  NHANESDataFileName    - name of the fixed-width NHANES data file that is
%                          written after 'using' in the infix command
%Output:
%  1 when the INPUT section was found and the infix do-file was written,
%  0 otherwise (a warning is issued in that case).
%
%Expected SAS layout: each section starts with a line whose first word is
%the keyword (LENGTH, FORMAT, INPUT or LABEL), has one variable per line, and
%ends at the first line whose last non-blank character is ';'. The
%terminator may stand alone on a line or follow the last entry.
%
%NOTE both output files are opened in append mode ('at'), exactly as before,
%so running the conversion twice on the same output names appends a second copy.

NHANESCOPEConvertSASCodeToSTATACodeToReadNHANESDataFileSuccess = 0;

%Read the whole SAS program once. fileread() raises an error when the file
%cannot be opened, so a bad file name is reported immediately.
sasLines = regexp(fileread(SASCodeFileName), '\r\n|\r|\n', 'split');

%Information collected from the four sections
stringVarNames = {};            %LENGTH: variables declared with '$'
inputVarNames = {};             %INPUT: variable names, in file order
inputVarColumns = {};           %INPUT: column range text, e.g. '1-5'
inputVarHasDollar = false(1,0); %INPUT: true when the entry itself has '$'
formatVarNames = {};            %FORMAT: variable names
formatVarSpecs = {};            %FORMAT: SAS format text, e.g. 'Z6.3'
labelVarNames = {};             %LABEL: variable names
labelVarTexts = {};             %LABEL: text after '=', quotes included
foundFormatSection = false;
foundLabelSection = false;
activeSection = '';             %keyword of the section being read, '' if none

%-------------- Single pass over the SAS program ----------------------
for lineIndex = 1:numel(sasLines)
    textLine = strtrim(sasLines{lineIndex});
    if isempty(textLine)
        continue
    end
    [firstToken, restOfLine] = strtok(textLine);

    if isempty(activeSection)
        %Outside a section only the four keywords are of interest
        switch firstToken
            case {'LENGTH', 'LABEL'}
                disp('The targetted SAS word is found, the first word of tline and the remaining tline are:')
                disp(firstToken)
                disp(restOfLine)
            case 'INPUT'
                disp('The targetted SAS word INPUT is found, the first word of tline from strtok():')
                disp(firstToken)
                disp('The remaining tline from strtok():')
                disp(restOfLine)
            case 'FORMAT'
                disp('The targetted SAS word FORMAT is found, the first word of tline:')
                disp(firstToken)
                disp('The remaining tline are:')
                disp(restOfLine)
            otherwise
                continue
        end
        activeSection = firstToken;
        foundFormatSection = foundFormatSection || strcmp(firstToken, 'FORMAT');
        foundLabelSection = foundLabelSection || strcmp(firstToken, 'LABEL');
        continue
    end

    %Inside a section: a trailing ';' closes it. Only the LAST character is
    %tested so that a ';' inside a quoted label does not end the section.
    sectionEndsHere = (textLine(end) == ';');
    if sectionEndsHere
        textLine = strtrim(textLine(1:end-1));
    end

    if ~isempty(textLine)
        switch activeSection
            case 'LENGTH'
                [varName, varSpec] = strtok(textLine);
                if any(varSpec == '$') %'$' marks a character variable
                    stringVarNames{end+1} = varName; %#ok<AGROW>
                end
            case 'INPUT'
                [varName, varSpec] = strtok(textLine);
                inputVarNames{end+1} = varName; %#ok<AGROW>
                inputVarHasDollar(end+1) = any(varSpec == '$'); %#ok<AGROW>
                %STATA infix does not accept '$'; keep only the column range
                inputVarColumns{end+1} = strtrim(strrep(varSpec, '$', '')); %#ok<AGROW>
            case 'FORMAT'
                [varName, varSpec] = strtok(textLine);
                formatVarNames{end+1} = varName; %#ok<AGROW>
                formatVarSpecs{end+1} = strtrim(varSpec); %#ok<AGROW>
            case 'LABEL'
                %NAME = "text"; the spaces around '=' are optional
                labelParts = regexp(textLine, '^([^\s=]+)\s*=\s*(.*)$', 'tokens', 'once');
                if ~isempty(labelParts)
                    labelVarNames{end+1} = labelParts{1}; %#ok<AGROW>
                    labelVarTexts{end+1} = labelParts{2}; %#ok<AGROW>
                end
        end
    end

    if sectionEndsHere
        activeSection = '';
    end
end

%-------------- STATA do-file: infix, labels, save ---------------------
if isempty(inputVarNames)
    warning('NHANESCOPE:noInputVariables', ...
        'No INPUT section variables were found in %s; %s was not written.', ...
        SASCodeFileName, STATADoFileName);
else
    doFileId = fopen(STATADoFileName, 'at'); %'at' argument flag fopen() for appending text
    if doFileId == -1
        error('NHANESCOPE:cannotOpenDoFile', 'Cannot open %s for writing.', STATADoFileName);
    end
    closeDoFile = onCleanup(@() fclose(doFileId)); %closes the file even if an error occurs

    fprintf(doFileId, '%s %s %s \n', '/*This STATA do-file was converted from a SAS program from NHANES ',SASCodeFileName,'*/');
    fprintf(doFileId, '%s \n', '/*by a NHANESCOPE utility program NHANESCOPEConvertSASCodeToSTATACodeToReadNHANESDataFile.*/');
    fprintf(doFileId, '%s \n', '/*% NHANESCOPE: National Health and Nutrition Examination Survey Clinical Outcome Prediction Expert.*/');
    fprintf(doFileId, '%s \n', ' ');
    fprintf(doFileId, '%s \n', '# delimit ;');
    fprintf(doFileId, '%s \n', 'clear;');
    fprintf(doFileId, '%s \n', 'set mem 200m;');
    fprintf(doFileId, '%s \n', ' ');
    fprintf(doFileId, '%s \n', ' ');
    fprintf(doFileId, '%s \n', ' ');
    fprintf(doFileId, '%s \n', '/*Input all variables*/');
    fprintf(doFileId, '%s \n', 'infix');

    for k = 1:numel(inputVarNames)
        %The string status is looked up by NAME, so the LENGTH and INPUT
        %sections need not list the variables in the same order
        isStringVar = inputVarHasDollar(k) || any(strcmp(inputVarNames{k}, stringVarNames));
        if isStringVar
            fprintf(doFileId, '%s \n', ['str ' inputVarNames{k} ' ' inputVarColumns{k}]);
        else
            fprintf(doFileId, '%s \n', [inputVarNames{k} ' ' inputVarColumns{k}]);
        end
    end

    %Write out the name of the NHANES data file; quote it when it contains
    %blanks so that STATA reads it as a single file name
    dataFileReference = strtrim(NHANESDataFileName);
    if any(isspace(dataFileReference)) && dataFileReference(1) ~= '"'
        dataFileReference = ['"' dataFileReference '"'];
    end
    fprintf(doFileId, '%s %s \n', 'using', [dataFileReference ';']);
    fprintf(doFileId, '%s \n', ' ');

    if foundLabelSection
        fprintf(doFileId, '%s \n', '/* DEFINE VARIABLE LABELS */');
        for k = 1:numel(labelVarNames)
            %Only variables that are actually read by infix can be labelled
            if any(strcmp(labelVarNames{k}, inputVarNames))
                fprintf(doFileId, '%s \n', ['label variable ' labelVarNames{k} ' ' labelVarTexts{k} ';']);
            end
        end
        fprintf(doFileId, '%s \n', ' ');
        fprintf(doFileId, '%s \n', ' ');
        fprintf(doFileId, '%s \n', '/* DATA FILE IS STORED IN nhanesData.DTA */');
        fprintf(doFileId, '%s \n', 'save nhanesData, replace;');
    end

    clear closeDoFile %triggers fclose(doFileId)
end

%-------------- STATA format do-file -----------------------------------
%NOTE SAS numeric formats are Zw.d or w.d: Z = leading zeros, w = total width
%including the decimals, d = number of decimals. They map to STATA %0w.df and
%%w.df. Any other format (character or user-defined) is skipped because it
%has no direct numeric STATA equivalent.
if foundFormatSection
    formatFileId = fopen(STATAFormatDOFileName, 'at'); %'at' argument flag fopen() for appending text
    if formatFileId == -1
        error('NHANESCOPE:cannotOpenFormatFile', 'Cannot open %s for writing.', STATAFormatDOFileName);
    end
    closeFormatFile = onCleanup(@() fclose(formatFileId));

    fprintf(formatFileId, '%s \n', '/*This section derived from SAS format section to format the Stata variables*/');
    fprintf(formatFileId, '%s \n', ' ');
    for k = 1:numel(formatVarNames)
        formatParts = regexp(formatVarSpecs{k}, '^([Zz]?)(\d+)\.(\d*)$', 'tokens', 'once');
        if isempty(formatParts)
            continue
        end
        formatWidth = str2double(formatParts{2});
        if isempty(formatParts{3})
            formatDecimals = 0; %'w.' means no decimals
        else
            formatDecimals = str2double(formatParts{3});
        end
        if isempty(formatParts{1})
            zeroFlag = '';
        else
            zeroFlag = '0';
        end
        %Built by concatenation because sprintf() drops empty arguments
        stataFormat = ['%' zeroFlag sprintf('%d.%d', formatWidth, formatDecimals)];
        fprintf(formatFileId, '%s %s%s %s \n', 'format', stataFormat, 'f', formatVarNames{k});
        fprintf(formatFileId, '%s \n', ' ');
    end
    fprintf(formatFileId, '%s \n', ' ');

    clear closeFormatFile %triggers fclose(formatFileId)
end

%Success means the infix do-file could be produced
NHANESCOPEConvertSASCodeToSTATACodeToReadNHANESDataFileSuccess = double(~isempty(inputVarNames));