Code covered by the BSD License  

Highlights from
Sequenom2pedfile

image thumbnail

Sequenom2pedfile

by

 

It creates the pedfile from the genotyping info generated from Sequenom Mass Array runs

Sequenom2pedfile(varargin)
% SEQUENOM2PEDFILE M-file for Sequenom2pedfile.fig
% This script is useful for generating files in linkage format (pedfile and
% mapfile) from the outputs generated from Sequenom MassArray runs

% Authorship: Ismael Huertas  email: ihuertas-ibis@us.es
% Institution: Lab of Movement Disorders, Institute of Biomedicine of Sevilla (IBiS) , Spain.

function varargout = Sequenom2pedfile(varargin)


gui_Singleton = 1;
gui_State = struct('gui_Name',       mfilename, ...
                   'gui_Singleton',  gui_Singleton, ...
                   'gui_OpeningFcn', @Sequenom2pedfile_OpeningFcn, ...
                   'gui_OutputFcn',  @Sequenom2pedfile_OutputFcn, ...
                   'gui_LayoutFcn',  [] , ...
                   'gui_Callback',   []);
if nargin && ischar(varargin{1})
    gui_State.gui_Callback = str2func(varargin{1});
end

if nargout
    [varargout{1:nargout}] = gui_mainfcn(gui_State, varargin{:});
else
    gui_mainfcn(gui_State, varargin{:});
end
% End initialization code - DO NOT EDIT


% --- Executes just before Sequenom2pedfile is made visible.
function Sequenom2pedfile_OpeningFcn(hObject, eventdata, handles, varargin)

%%%%%%%%%%%%%%%%% GLOBAL VARIABLES %%%%%%%%%%%%%%%%%%%

handles.gtable={};       % Matrix for storing all the info 
handles.setgenes={};     % List of genes   
handles.listsnp=[];      % List of SNPs
handles.chr=[];          % Chromosome of each SNP
handles.position=[];     % Position of each SNP   
handles.sample=[];       % Sample ID
handles.snp=[];          % SNP ID
handles.sex=[];          % sex of sample
handles.pheno=[];        % trait value of sample
handles.gene=[];         % Gene of each SNP
handles.dupmenu=1;       % Aux variable for repeated samples option

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Choose default command line output for Sequenom2pedfile
handles.output = hObject;

% Update handles structure
guidata(hObject, handles);

% UIWAIT makes Sequenom2pedfile wait for user response (see UIRESUME)
% uiwait(handles.figure1);


% --- Outputs from this function are returned to the command line.
function varargout = Sequenom2pedfile_OutputFcn(hObject, eventdata, handles) 
varargout{1} = handles.output;


% It allows the user to select the genes appearing in the output files
function list_of_genes_Callback(hObject, eventdata, handles)
listsnp=[]; % stores the SNPs belonging to these genes 
setgenes=handles.setgenes;
s=get(handles.list_of_genes,'Value');
for m=1:length(s)
   listsnp=[listsnp cell2mat(setgenes(2,s(m)))'];
end
handles.listsnp=listsnp;
 guidata(hObject, handles);



function list_of_genes_CreateFcn(hObject, eventdata, handles)
if ispc && isequal(get(hObject,'BackgroundColor'), get(0,'defaultUicontrolBackgroundColor'))
    set(hObject,'BackgroundColor','white');
end

% It generates a textfile per gene with the genotyping info 
function generate_info_Callback(hObject, eventdata, handles)

% Retrieve global variables
setgenes=handles.setgenes;
gene=handles.gene;
snp=handles.snp;
sample=handles.sample;
gtable=handles.gtable;

% Identify the SNPs of each gene and generate a separated textfile
for k=1:length(setgenes)
   fid = fopen(sprintf('%s.txt',char(setgenes(1,k))), 'w');
   index=cell2mat(setgenes(2,k));
 
   for p=1:length(sample) 
     fprintf(fid, '%s ',char(sample(p)));
     
     for q=index(1):index(end)
        if(isempty(cell2mat(gtable(p+1,q+1))))
         fprintf(fid, '%d ',0);
        else
         fprintf(fid, '%s ',char(gtable(p+1,q+1)));
        end
     end
     fprintf(fid, '\r\n');
   end
   fclose(fid);
  
end
 xlswrite('GENOTYPES.xls',gtable);
 guidata(hObject, handles);

% It generates the pedfile
function generate_pedfile_Callback(hObject, eventdata, handles)

% Retrieve global variables
snp=handles.snp;
sex=handles.sex;
pheno=handles.pheno;
gtable=handles.gtable;
sample=handles.sample;
listsnp=handles.listsnp;

fid = fopen('pedfile.ped', 'w');

if(find(hist(str2num(char(pheno))))>2)
    typetrait=1;
else
    typetrait=0;
end
 
 for p=1:length(sample)
     
     fprintf(fid, '%d ',p);
     
     fprintf(fid, '%s %d %d ',char(sample(p)),0,0);
    
   if(~isnan(char(sex(p,1))))
     fprintf(fid, '%c ',char(sex(p)));
   else
     fprintf(fid, '%d ',0);
   end
     
   if(~isnan(char(pheno(p))))
     fprintf(fid, '%s ',char(pheno(p)));
   else
       switch(typetrait)
           case 0
               fprintf(fid, '%d ',0);
           case 1
               fprintf(fid, '%d ',-9);
       end
     
   end
   
   
     for k=1:length(listsnp)
         q=listsnp(k);
        if(isempty(cell2mat(gtable(p+1,q+1))))
         fprintf(fid, '%d %d ',0,0);
        else
         str=regexp(char(gtable(p+1,q+1)),'.','match');
         fprintf(fid, '%c %c ',str{1},str{2});
        end
     end
     fprintf(fid, '\r\n');
   
 end
    
fclose(fid);

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% It reorganizes the genotype table according to repeated samples across the
% plates. These repetitions allows the control of the concordance.
function load_duplicates_Callback(hObject, eventdata, handles)
% Retrieve global variables
snp=handles.snp;
sex=handles.sex;
age=handles.age;
pheno=handles.pheno;
sample=handles.sample;
gtable=handles.gtable;

% Read textfile containing repeated samples
[txtc pathc]=uigetfile('*.txt','Select textfile with duplicated samples');
fid=fopen(char(strcat(pathc,txtc)));
set(handles.duplicates_pathname,'String', char(strcat(pathc,txtc)));
dup = textscan(fid,'%s%s');

% Load columns to variables
subid = dup{1}(2:end);
samid = dup{2}(2:end);
samex=[];d=1;i = 1;
num_c=zeros(1,length(snp));den_c=zeros(1,length(snp));

% This loop compares the genotypes of the repeated samples for each SNP,
% calculates the concordance rate, and leaves only one entry per sample with
% the most frequent genotype at each SNP
while(i<length(subid))
   sdup={};
   numgn=[];
   
   ind = strmatch(subid(i),subid);
   
   for k=1:length(ind)
   samidx=strmatch(samid(ind(k),:),sample,'exact');
   sdup(k,:)=gtable(samidx+1,:);
   if(k<length(ind))
       samex=[samex samidx+1];
   end
   end

   lista{1,d}=sdup;
   strgen(1,1)=sdup(k,1);
   
   for k=2:size(sdup,2)
        gs=sdup(:,k);
        for q=1:length(gs)
            numgn(q)=sum(cell2mat(regexpi(gs, gs(q))));
        end
        indx= find(numgn==max(numgn));
        if (max(numgn)~=0)
        num_c(1,k-1)=num_c(1,k-1)+max(numgn)-1;
        den_c(1,k-1)=den_c(1,k-1)+length(find(numgn))-1;
        end
        if(length(indx)==max(numgn))
            strgen(1,k)=gs(indx(1));
        else
            strgen(1,k)={''};
        end
     
   end

   gtable(samidx+1,:)=strgen;
   
   i=ind(end)+1;
    
end
% Update gtable matrix dropping out the repetitions
auxvec = 1:size(gtable,1);
auxvec(1,[samex])=0;
ac_indx=find(auxvec);
handles.gtable=gtable(ac_indx,:);
ac_indx=ac_indx-1;
handles.age=age(ac_indx(2:end),1);
handles.sex=sex(ac_indx(2:end),1);
handles.pheno=pheno(ac_indx(2:end),1);
handles.sample=sample(ac_indx(2:end),1);

% Generates Descriptives textfile of the studied sample
fid = fopen('descriptives.txt', 'w');
for j=1:length(handles.sample)
fprintf(fid, '%s %s %s %s \r\n',char(handles.sample(j)),char(handles.sex(j)),char(handles.age(j)),char(handles.pheno(j)));
end
fclose(fid);

% Generates SNP concordance rate textfile
conc = 100*(num_c./den_c)';
fid = fopen('concordance.txt','wt');
fprintf(fid,'%s  %s\n','SNP','Concordance');
for i=1:length(snp)
fprintf(fid,'%s\t',char(snp(i)));
fprintf(fid,'%2.2f%%\n',conc(i));
end
fclose(fid);
guidata(hObject, handles);


function list_GenotypeArea_CreateFcn(hObject, eventdata, handles)
if ispc && isequal(get(hObject,'BackgroundColor'), get(0,'defaultUicontrolBackgroundColor'))
    set(hObject,'BackgroundColor','white');
end


% It loads the genotype info
function load_genotype_data_Callback(hObject, eventdata, handles)

% Load excel files from Sequenom runs
[xlsq path]=uigetfile('*.xls','Select GenotypeArea.xls files output from Sequenom','MultiSelect','on');
set(handles.list_GenotypeArea,'String',xlsq);
if(iscell(xlsq))
    co=length(xlsq);
else
    co=1;
end

% Retrieve needed global variables
sample=handles.sample;
snp=handles.snp;
gtable=handles.gtable;

% gtable will be a table containing all the genotyping info
% Its first row and column are the SNP and sample tags respectively
gtable = cell(length(sample)+1,length(snp)+1);
gtable(1,2:end) = snp;
gtable(2:end,1) = sample;

% Loop reading each GenotypeArea.xls file
for is=1:co
    
    % Distinguish multiple choice and single choice
    if(iscell(xlsq))
       sq=importdata(char(strcat(path,xlsq{is})));
    else
       sq=importdata(char(strcat(path,xlsq)));  
    end
    
    % Load in variables the matrix from Genotypes' sheet
    samplesq=sq.textdata.Genotypes(2:end,1);
    positionsq=sq.textdata.Genotypes(2:end,2);
    namesnp=sq.textdata.Genotypes(1,3:end);
    genotype=sq.textdata.Genotypes(2:end,3:end);
    
    % Run the loaded matrix and fill in the gtable matrix
    for k=1:size(samplesq,1)
        
        samidx = strmatch(samplesq(k),sample,'exact');
        
        if(~isempty(samidx))
            
            for h=1:size(namesnp,2)
                
                snpidx = strmatch(namesnp(h),snp,'exact');
                
                if(~isempty(snpidx))
                    
                    if(~isempty(cell2mat(genotype(k,h)))& isempty(cell2mat(gtable(samidx+1,snpidx+1))))
                        if(length(genotype{k,h})==1)
                            gtable(samidx+1,snpidx+1)=strcat(genotype(k,h),genotype(k,h));
                        else
                            gtable(samidx+1,snpidx+1)=genotype(k,h);
                        end
                    end

                end

            end
           
        end
        

        
    end
    

    
end

%Update Global variables
handles.gtable=gtable;
 guidata(hObject, handles); 

% It reads the samplefile. This file contains information about the samples
% that have been genotyped.
function load_samples_data_Callback(hObject, eventdata, handles)
[txts paths]=uigetfile('*.txt','Select samplefile');
fids=fopen(char(strcat(paths,txts)));
set(handles.samplefile_pathname,'String', char(strcat(paths,txts)));
sam = textscan(fids,'%s%s%s%s');
sample = sam{1}(2:end);
sex = sam{2}(2:end);
age = sam{3}(2:end);
pheno = sam{4}(2:end);

%Update Global variables
handles.sample=sample;
handles.sex=sex;
handles.pheno=pheno;
handles.age=age;
 guidata(hObject, handles);

% It reads the mapfile. This file contains information about the SNPs that
% are going to be read and reported in the pedfile.
function load_mapfile_Callback(hObject, eventdata, handles)
% Load mapile
[txtm pathm]=uigetfile('*.txt','Select mapfile');
fidm=fopen(char(strcat(pathm,txtm)));
set(handles.mapfile_pathname,'String', char(strcat(pathm,txtm)));
% Read mapfile, it assigns each column to a global variable
map = textscan(fidm,'%s%s%s%s');
chr = map{1}(2:end);
snp = map{2}(2:end);
gene = map{3}(2:end);
position = map{4}(2:end);

% Find out what genes are present in the files
setgenes=gene(1);
for k=2:size(gene,1)
  if(isempty(strmatch(gene(k),setgenes,'exact')))
     setgenes(end+1)=gene(k);
  end 
end

% Identify which SNPs belong to each gene
for k=1:length(setgenes)
matches = regexp(char(setgenes(1,k)),gene);
index=find(~cellfun(@isempty,matches));
setgenes{2,k}=index;
end

cc=setgenes(1,:);
set(handles.list_of_genes,'String',cc);

%Update global variables
handles.chr=chr;
handles.snp=snp;
handles.gene=gene;
handles.position=position;
handles.setgenes=setgenes;
guidata(hObject, handles);


% It generates the mapfile
function gen_mapfile_Callback(hObject, eventdata, handles)

% Retrieve global variables
chr = handles.chr;
position = handles.position;
snp = handles.snp;
listsnp = handles.listsnp;

fid = fopen('mapfile.map','w')
for k=1:length(listsnp)
    q=listsnp(k);
    fprintf(fid, '%s %s %d %s\r\n',char(chr(q)),char(snp(q)),0, char(position(q)) );
        
end
fclose(fid);


Contact us