Code covered by the BSD License  

Highlights from
NHANESCOPEConvertSASCodeToSTATACodeToReadNHANESDataFile

NHANESCOPEConvertSASCodeToSTATACodeToReadNHANESDataFile

by

 

This NHANESCOPE program converts a SAS input program to a STATA do-file

NHANESCOPEConvertSASCodeToSTATACodeToReadNHANESDataFile(SASCodeFileName,STATADoFileName, STATAFormatDOFileName, NHANESDataFileName)
% NHANESCOPE: National Health and Nutrition Examination Survey Clinical Outcome Prediction Expert
% NHANESCOPEConvertSASCodeToSTATACodeToReadNHANESDataFile
% This is a companion to SCOPE (SEER Clinical Outcome Prediction Expert)
% This program is part of a software suite developed to analyze NCHS (National Center for Health Statistics) public use NHANES data (National
% Health and Nutrition Examination Survey)
% Winter 2013, Main Line, Pennsylvania, USA.
% Dr. Rex Cheung, cheung.r100@gmail.com

%This program has been validated with public use adult.sas (save as adult.txt by a free SAS viewer) and
%adult.dat files. These files could be downloaded from NHANES III website. 

%This program expects 1. the filename of a SAS file provided by NCHS
%NHANES, 2. the filename of a STATA 12 do-file, 3. the filename of a STATA do-file to store the formats and 4. the NHANES data file
%name
%Commands to run the script:
%SASCodeFileName='adult.txt'; STATADoFileName = 'stataadult1.txt';
%STATAFormatDOFileName = 'stataadultformat1.txt'; NHANESDataFileName = 'adult.dat';
% NHANESCOPEConvertSASCodeToSTATACodeToReadNHANESDataFileSuccess = NHANESCOPEConvertSASCodeToSTATACodeToReadNHANESDataFile(SASCodeFileName,STATADoFileName, STATAFormatDOFileName, NHANESDataFileName)

%It has been tested without problems in STATA 12
%Make sure adult.dat is in the STATA working directory
%1. Load the stataadult1.txt, save as a do-file, run it, you will get 1238
%variables and 20050 records (rows)
%2. Load the stataadultformat1.txt, save as a do-file, run it, you will
%have all the variables formatted according to the formatting information
%in the SAS file.
%NOTE the STATA needs to load the data before it can format them

%This program assumes the SAS code from NCHS NHANES website has the following format:
%  ...........
%  LENGTH
%         SEQN      7
%         .....
%         HAZNOK5R  3
%  ;
% 
%  FORMAT
%         DMPPIR   Z6.3
%         .....
%         HAT25MET Z3.2
%  ;
% 
%  INPUT
%         SEQN     1-5
%         ....
%         HAZNOK5R 3345-3346
% ;
% 
% LABEL
%         SEQN     = "Sequence number"
%         ....
%         HAZNOK5R = "Number of BP's used for average K5";
%  ...........

 
function NHANESCOPEConvertSASCodeToSTATACodeToReadNHANESDataFileSuccess = NHANESCOPEConvertSASCodeToSTATACodeToReadNHANESDataFile(SASCodeFileName,STATADoFileName, STATAFormatDOFileName, NHANESDataFileName)
%Convert a SAS input program for an NHANES fixed-column data file into STATA do-files.
%
% Inputs:
%   SASCodeFileName       - name of the SAS program (plain text) to read.
%   STATADoFileName       - name of the STATA do-file that receives the infix
%                           command and the variable labels (opened in append mode).
%   STATAFormatDOFileName - name of the STATA do-file that receives the format
%                           commands (opened in append mode).
%   NHANESDataFileName    - name written after "using" in the infix command.
%
% Output:
%   NHANESCOPEConvertSASCodeToSTATACodeToReadNHANESDataFileSuccess - 1 when the
%                           function runs to completion.
%
% An error is raised when the SAS file or an output file cannot be opened.
%
% The SAS program is expected to contain LENGTH, FORMAT, INPUT and LABEL
% sections, one variable per line, each section closed by a ';' that is either
% on its own line or at the end of the last entry. A section whose keyword is
% absent produces no output for that section.

%-------------- Read the SAS code once ------------------------------
%The file is read in full before parsing so that every line, including the
%first one, is examined and the end-of-file marker returned by fgetl() is
%never handed to the string functions.
sasFid = fopen(SASCodeFileName);
if sasFid == -1
    error('NHANESCOPE:cannotOpenSASFile', 'Cannot open SAS code file "%s".', SASCodeFileName);
end
sasLines = {};
tline = fgetl(sasFid);
while ischar(tline)
    sasLines{end+1} = tline; %#ok<AGROW>
    tline = fgetl(sasFid);
end
fclose(sasFid);

%-------------- LENGTH ------------------------------
%A '$' after the variable name in the LENGTH section marks a string variable.
lengthEntries = readSASSection(sasLines, 'LENGTH');
nLength = numel(lengthEntries);
VarLengthVarName = cell(1, nLength);
VarLengthStringIndicator = false(1, nLength);
for i = 1:nLength
    [varName, rest] = strtok(lengthEntries{i});
    VarLengthVarName{i} = varName;
    VarLengthStringIndicator(i) = ~isempty(strfind(rest, '$'));
end
stringVarNames = VarLengthVarName(VarLengthStringIndicator);

%------------- INPUT SECTION ------------------------------------------
%Each entry holds a variable name followed by its column range in the
%fixed-format data file.
[inputEntries, inputFound] = readSASSection(sasLines, 'INPUT');
nInput = numel(inputEntries);
inputvarname = cell(1, nInput);
infixLines = cell(1, nInput);
for i = 1:nInput
    [varName, columns] = strtok(inputEntries{i});
    inputvarname{i} = varName;
    %String status is looked up by NAME in the LENGTH section (the two
    %sections need not list the variables in the same order); a '$' on the
    %INPUT line itself also marks a string.
    isString = any(strcmp(varName, stringVarNames)) || ~isempty(strfind(columns, '$'));
    %STATA infix takes only the column range, so the SAS '$' is dropped.
    columns = strrep(columns, '$', '');
    if isString
        infixLines{i} = ['str' '  ' varName '  ' columns];
    else
        infixLines{i} = [varName '  ' columns];
    end
end

if inputFound
    doLines = { ...
        ['/*This STATA do-file was converted from a SAS program from NHANES ' ' ' SASCodeFileName ' ' '*/'], ...
        '/*by a NHANESCOPE utility program NHANESCOPEConvertSASCodeToSTATACodeToReadNHANESDataFile.*/', ...
        '/*% NHANESCOPE: National Health and Nutrition Examination Survey Clinical Outcome Prediction Expert.*/', ...
        '   ', ...
        '# delimit ;', ...
        'clear;', ...
        'set mem 200m;', ...
        '   ', ...
        '   ', ...
        '   ', ...
        '/*Input all variables*/', ...
        'infix'};
    usingLine = ['using' '  ' strcat(NHANESDataFileName, ';')];
    doLines = [doLines, infixLines, {usingLine, '   '}];
    appendLinesToFile(STATADoFileName, doLines);
end

%--------------- FORMAT -----------------------------------------------
%SAS Zw.d: Z = leading zeros, w = total width, d = number of decimals.
%It is written to STATA as %0w.df. The format commands go to a separate
%do-file because STATA must load the data before it can format them.
[formatEntries, formatFound] = readSASSection(sasLines, 'FORMAT');
if formatFound
    formatLines = {'/*This section derived from SAS format section to format the Stata variables*/', ' '};
    for i = 1:numel(formatEntries)
        [varName, spec] = strtok(formatEntries{i});
        zParts = regexp(strtrim(spec), '^Z(\d+)\.(\d*)$', 'tokens', 'once');
        if isempty(zParts)
            %No Zw.d specification: there is no width to translate, so no
            %format command is written (a zero-width format is meaningless).
            continue
        end
        width = str2double(zParts{1});
        if isempty(zParts{2})
            decimals = 0; %"Zw." means no decimals
        else
            decimals = str2double(zParts{2});
        end
        formatLines{end+1} = ['format' ' ' '%0' num2str(width) '.' num2str(decimals) 'f' ' ' varName]; %#ok<AGROW>
        formatLines{end+1} = ' '; %#ok<AGROW>
    end
    formatLines{end+1} = ' ';
    appendLinesToFile(STATAFormatDOFileName, formatLines);
end

%----------- LABEL ---------------------------------------------
%Each entry is  NAME = "label text". Labels are written only for variables
%that appear in the INPUT section, in INPUT order.
[labelEntries, labelFound] = readSASSection(sasLines, 'LABEL');
if labelFound
    VarLabelValue = cell(1, nInput); %an empty cell means "no label for this variable"
    for i = 1:numel(labelEntries)
        parts = regexp(labelEntries{i}, '^([^\s=]+)\s*=\s*(.*)$', 'tokens', 'once');
        if isempty(parts)
            continue %not a NAME = "text" line
        end
        position = find(strcmp(parts{1}, inputvarname), 1);
        if ~isempty(position)
            VarLabelValue{position} = parts{2};
        end
    end

    labelLines = {'/* DEFINE VARIABLE LABELS */'};
    for j = 1:nInput
        %Whole-value test: a label is kept whatever characters it contains.
        if ~isempty(VarLabelValue{j})
            labelLines{end+1} = ['label' '   ' 'variable' '   ' inputvarname{j} '    ' VarLabelValue{j} ';']; %#ok<AGROW>
        end
    end
    labelLines = [labelLines, {' ', ' ', ...
        '/* DATA FILE IS STORED IN nhanesData.DTA */', ...
        'save nhanesData, replace;'}];
    appendLinesToFile(STATADoFileName, labelLines);
end

NHANESCOPEConvertSASCodeToSTATACodeToReadNHANESDataFileSuccess = 1; %Returns success


function [entries, found] = readSASSection(sasLines, keyword)
%Collect the entries of the SAS section(s) introduced by KEYWORD.
% sasLines - cell array with the lines of the SAS program.
% keyword  - section keyword; it must be the first word on its line.
% entries  - cell array of trimmed, non-empty entry lines with the closing
%            ';' removed. Text following the keyword on the same line is
%            treated as the first entry. If the keyword occurs more than
%            once, the entries of all occurrences are concatenated.
% found    - true when the keyword was seen at least once.
%A section ends at the first line whose last non-blank character is ';',
%or at the end of the file if no such line exists.
entries = {};
found = false;
inSection = false;
for k = 1:numel(sasLines)
    thisLine = strtrim(sasLines{k});
    if ~inSection
        [firstWord, rest] = strtok(thisLine);
        if ~strcmp(firstWord, keyword)
            continue
        end
        found = true;
        inSection = true;
        disp(['The targeted SAS word ' keyword ' is found.'])
        thisLine = strtrim(rest);
    end
    if isempty(thisLine)
        continue %blank lines inside a section are skipped
    end
    if thisLine(end) == ';'
        inSection = false;
        thisLine = strtrim(thisLine(1:end-1));
    end
    if ~isempty(thisLine)
        entries{end+1} = thisLine; %#ok<AGROW>
    end
end


function appendLinesToFile(fileName, textLines)
%Append each element of the cell array textLines to fileName, one per line.
%Every line is written as the text, one space and a newline.
outFid = fopen(fileName, 'at'); %'at' = append, text mode
if outFid == -1
    error('NHANESCOPE:cannotOpenOutputFile', 'Cannot open output file "%s" for appending.', fileName);
end
closeOnExit = onCleanup(@() fclose(outFid)); %#ok<NASGU> closes the file even if a write fails
for k = 1:numel(textLines)
    fprintf(outFid, '%s \n', textLines{k});
end

  

Contact us