Code covered by the BSD License  

Highlights from
Sequenom2pedfile

image thumbnail
from Sequenom2pedfile by Ismael Huertas
It creates the pedfile from the genotyping info generated from Sequenom Mass Array runs

Sequenom2pedfile(varargin)
% SEQUENOM2PEDFILE M-file for Sequenom2pedfile.fig
% This script is useful for generating files in linkage format (pedfile and
% mapfile) from the outputs generated from Sequenom MassArray runs

% Authorship: Ismael Huertas  email: ihuertas-ibis@us.es
% Institution: Lab of Movement Disorders, Institute of Biomedicine of Sevilla (IBiS) , Spain.

function varargout = Sequenom2pedfile(varargin)


gui_Singleton = 1;
gui_State = struct('gui_Name',       mfilename, ...
                   'gui_Singleton',  gui_Singleton, ...
                   'gui_OpeningFcn', @Sequenom2pedfile_OpeningFcn, ...
                   'gui_OutputFcn',  @Sequenom2pedfile_OutputFcn, ...
                   'gui_LayoutFcn',  [] , ...
                   'gui_Callback',   []);
if nargin && ischar(varargin{1})
    gui_State.gui_Callback = str2func(varargin{1});
end

if nargout
    [varargout{1:nargout}] = gui_mainfcn(gui_State, varargin{:});
else
    gui_mainfcn(gui_State, varargin{:});
end
% End initialization code - DO NOT EDIT


% --- Executes just before Sequenom2pedfile is made visible.
function Sequenom2pedfile_OpeningFcn(hObject, eventdata, handles, varargin)

%%%%%%%%%%%%%%%%% GLOBAL VARIABLES %%%%%%%%%%%%%%%%%%%

handles.gtable={};       % Matrix for storing all the info 
handles.setgenes={};     % List of genes   
handles.listsnp=[];      % List of SNPs
handles.chr=[];          % Chromosome of each SNP
handles.position=[];     % Position of each SNP   
handles.sample=[];       % Sample ID
handles.snp=[];          % SNP ID
handles.sex=[];          % sex of sample
handles.pheno=[];        % trait value of sample
handles.gene=[];         % Gene of each SNP
handles.dupmenu=1;       % Aux variable for repeated samples option

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Choose default command line output for Sequenom2pedfile
handles.output = hObject;

% Update handles structure
guidata(hObject, handles);

% UIWAIT makes Sequenom2pedfile wait for user response (see UIRESUME)
% uiwait(handles.figure1);


% --- Outputs from this function are returned to the command line.
function varargout = Sequenom2pedfile_OutputFcn(hObject, eventdata, handles) 
varargout{1} = handles.output;


% It allows the user to select the genes appearing in the output files
function list_of_genes_Callback(hObject, eventdata, handles)
listsnp=[]; % stores the SNPs belonging to these genes 
setgenes=handles.setgenes;
s=get(handles.list_of_genes,'Value');
for m=1:length(s)
   listsnp=[listsnp cell2mat(setgenes(2,s(m)))'];
end
handles.listsnp=listsnp;
 guidata(hObject, handles);



function list_of_genes_CreateFcn(hObject, eventdata, handles)
if ispc && isequal(get(hObject,'BackgroundColor'), get(0,'defaultUicontrolBackgroundColor'))
    set(hObject,'BackgroundColor','white');
end

% It generates a textfile per gene with the genotyping info 
function generate_info_Callback(hObject, eventdata, handles)

% Retrieve global variables
setgenes=handles.setgenes;
gene=handles.gene;
snp=handles.snp;
sample=handles.sample;
gtable=handles.gtable;

% Identify the SNPs of each gene and generate a separated textfile
for k=1:length(setgenes)
   fid = fopen(sprintf('%s.txt',char(setgenes(1,k))), 'w');
   index=cell2mat(setgenes(2,k));
 
   for p=1:length(sample) 
     fprintf(fid, '%s ',char(sample(p)));
     
     for q=index(1):index(end)
        if(isempty(cell2mat(gtable(p+1,q+1))))
         fprintf(fid, '%d ',0);
        else
         fprintf(fid, '%s ',char(gtable(p+1,q+1)));
        end
     end
     fprintf(fid, '\r\n');
   end
   fclose(fid);
  
end
 xlswrite('GENOTYPES.xls',gtable);
 guidata(hObject, handles);

% It generates the pedfile
function generate_pedfile_Callback(hObject, eventdata, handles)

% Retrieve global variables
snp=handles.snp;
sex=handles.sex;
pheno=handles.pheno;
gtable=handles.gtable;
sample=handles.sample;
listsnp=handles.listsnp;

fid = fopen('pedfile.ped', 'w');

if(find(hist(str2num(char(pheno))))>2)
    typetrait=1;
else
    typetrait=0;
end
 
 for p=1:length(sample)
     
     fprintf(fid, '%d ',p);
     
     fprintf(fid, '%s %d %d ',char(sample(p)),0,0);
    
   if(~isnan(char(sex(p,1))))
     fprintf(fid, '%c ',char(sex(p)));
   else
     fprintf(fid, '%d ',0);
   end
     
   if(~isnan(char(pheno(p))))
     fprintf(fid, '%s ',char(pheno(p)));
   else
       switch(typetrait)
           case 0
               fprintf(fid, '%d ',0);
           case 1
               fprintf(fid, '%d ',-9);
       end
     
   end
   
   
     for k=1:length(listsnp)
         q=listsnp(k);
        if(isempty(cell2mat(gtable(p+1,q+1))))
         fprintf(fid, '%d %d ',0,0);
        else
         str=regexp(char(gtable(p+1,q+1)),'.','match');
         fprintf(fid, '%c %c ',str{1},str{2});
        end
     end
     fprintf(fid, '\r\n');
   
 end
    
fclose(fid);

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% It reorganizes the genotype table according to repeated samples across the
% plates. These repetitions allows the control of the concordance.
function load_duplicates_Callback(hObject, eventdata, handles)
% Retrieve global variables
snp=handles.snp;
sex=handles.sex;
age=handles.age;
pheno=handles.pheno;
sample=handles.sample;
gtable=handles.gtable;

% Read textfile containing repeated samples
[txtc pathc]=uigetfile('*.txt','Select textfile with duplicated samples');
fid=fopen(char(strcat(pathc,txtc)));
set(handles.duplicates_pathname,'String', char(strcat(pathc,txtc)));
dup = textscan(fid,'%s%s');

% Load columns to variables
subid = dup{1}(2:end);
samid = dup{2}(2:end);
samex=[];d=1;i = 1;
num_c=zeros(1,length(snp));den_c=zeros(1,length(snp));

% This loop compares the genotypes of the repeated samples for each SNP,
% calculates the concordance rate, and leaves only one entry per sample with
% the most frequent genotype at each SNP
while(i<length(subid))
   sdup={};
   numgn=[];
   
   ind = strmatch(subid(i),subid);
   
   for k=1:length(ind)
   samidx=strmatch(samid(ind(k),:),sample,'exact');
   sdup(k,:)=gtable(samidx+1,:);
   if(k<length(ind))
       samex=[samex samidx+1];
   end
   end

   lista{1,d}=sdup;
   strgen(1,1)=sdup(k,1);
   
   for k=2:size(sdup,2)
        gs=sdup(:,k);
        for q=1:length(gs)
            numgn(q)=sum(cell2mat(regexpi(gs, gs(q))));
        end
        indx= find(numgn==max(numgn));
        if (max(numgn)~=0)
        num_c(1,k-1)=num_c(1,k-1)+max(numgn)-1;
        den_c(1,k-1)=den_c(1,k-1)+length(find(numgn))-1;
        end
        if(length(indx)==max(numgn))
            strgen(1,k)=gs(indx(1));
        else
            strgen(1,k)={''};
        end
     
   end

   gtable(samidx+1,:)=strgen;
   
   i=ind(end)+1;
    
end
% Update gtable matrix dropping out the repetitions
auxvec = 1:size(gtable,1);
auxvec(1,[samex])=0;
ac_indx=find(auxvec);
handles.gtable=gtable(ac_indx,:);
ac_indx=ac_indx-1;
handles.age=age(ac_indx(2:end),1);
handles.sex=sex(ac_indx(2:end),1);
handles.pheno=pheno(ac_indx(2:end),1);
handles.sample=sample(ac_indx(2:end),1);

% Generates Descriptives textfile of the studied sample
fid = fopen('descriptives.txt', 'w');
for j=1:length(handles.sample)
fprintf(fid, '%s %s %s %s \r\n',char(handles.sample(j)),char(handles.sex(j)),char(handles.age(j)),char(handles.pheno(j)));
end
fclose(fid);

% Generates SNP concordance rate textfile
conc = 100*(num_c./den_c)';
fid = fopen('concordance.txt','wt');
fprintf(fid,'%s  %s\n','SNP','Concordance');
for i=1:length(snp)
fprintf(fid,'%s\t',char(snp(i)));
fprintf(fid,'%2.2f%%\n',conc(i));
end
fclose(fid);
guidata(hObject, handles);


function list_GenotypeArea_CreateFcn(hObject, eventdata, handles)
if ispc && isequal(get(hObject,'BackgroundColor'), get(0,'defaultUicontrolBackgroundColor'))
    set(hObject,'BackgroundColor','white');
end


% It loads the genotype info
function load_genotype_data_Callback(hObject, eventdata, handles)

% Load excel files from Sequenom runs
[xlsq path]=uigetfile('*.xls','Select GenotypeArea.xls files output from Sequenom','MultiSelect','on');
set(handles.list_GenotypeArea,'String',xlsq);
if(iscell(xlsq))
    co=length(xlsq);
else
    co=1;
end

% Retrieve needed global variables
sample=handles.sample;
snp=handles.snp;
gtable=handles.gtable;

% gtable will be a table containing all the genotyping info
% Its first row and column are the SNP and sample tags respectively
gtable = cell(length(sample)+1,length(snp)+1);
gtable(1,2:end) = snp;
gtable(2:end,1) = sample;

% Loop reading each GenotypeArea.xls file
for is=1:co
    
    % Distinguish multiple choice and single choice
    if(iscell(xlsq))
       sq=importdata(char(strcat(path,xlsq{is})));
    else
       sq=importdata(char(strcat(path,xlsq)));  
    end
    
    % Load in variables the matrix from Genotypes' sheet
    samplesq=sq.textdata.Genotypes(2:end,1);
    positionsq=sq.textdata.Genotypes(2:end,2);
    namesnp=sq.textdata.Genotypes(1,3:end);
    genotype=sq.textdata.Genotypes(2:end,3:end);
    
    % Run the loaded matrix and fill in the gtable matrix
    for k=1:size(samplesq,1)
        
        samidx = strmatch(samplesq(k),sample,'exact');
        
        if(~isempty(samidx))
            
            for h=1:size(namesnp,2)
                
                snpidx = strmatch(namesnp(h),snp,'exact');
                
                if(~isempty(snpidx))
                    
                    if(~isempty(cell2mat(genotype(k,h)))& isempty(cell2mat(gtable(samidx+1,snpidx+1))))
                        if(length(genotype{k,h})==1)
                            gtable(samidx+1,snpidx+1)=strcat(genotype(k,h),genotype(k,h));
                        else
                            gtable(samidx+1,snpidx+1)=genotype(k,h);
                        end
                    end

                end

            end
           
        end
        

        
    end
    

    
end

%Update Global variables
handles.gtable=gtable;
 guidata(hObject, handles); 

% It reads the samplefile. This file contains information about the samples
% that have been genotyped.
function load_samples_data_Callback(hObject, eventdata, handles)
[txts paths]=uigetfile('*.txt','Select samplefile');
fids=fopen(char(strcat(paths,txts)));
set(handles.samplefile_pathname,'String', char(strcat(paths,txts)));
sam = textscan(fids,'%s%s%s%s');
sample = sam{1}(2:end);
sex = sam{2}(2:end);
age = sam{3}(2:end);
pheno = sam{4}(2:end);

%Update Global variables
handles.sample=sample;
handles.sex=sex;
handles.pheno=pheno;
handles.age=age;
 guidata(hObject, handles);

% It reads the mapfile. This file contains information about the SNPs that
% are going to be read and reported in the pedfile.
function load_mapfile_Callback(hObject, eventdata, handles)
% Load mapile
[txtm pathm]=uigetfile('*.txt','Select mapfile');
fidm=fopen(char(strcat(pathm,txtm)));
set(handles.mapfile_pathname,'String', char(strcat(pathm,txtm)));
% Read mapfile, it assigns each column to a global variable
map = textscan(fidm,'%s%s%s%s');
chr = map{1}(2:end);
snp = map{2}(2:end);
gene = map{3}(2:end);
position = map{4}(2:end);

% Find out what genes are present in the files
setgenes=gene(1);
for k=2:size(gene,1)
  if(isempty(strmatch(gene(k),setgenes,'exact')))
     setgenes(end+1)=gene(k);
  end 
end

% Identify which SNPs belong to each gene
for k=1:length(setgenes)
matches = regexp(char(setgenes(1,k)),gene);
index=find(~cellfun(@isempty,matches));
setgenes{2,k}=index;
end

cc=setgenes(1,:);
set(handles.list_of_genes,'String',cc);

%Update global variables
handles.chr=chr;
handles.snp=snp;
handles.gene=gene;
handles.position=position;
handles.setgenes=setgenes;
guidata(hObject, handles);


% It generates the mapfile
function gen_mapfile_Callback(hObject, eventdata, handles)

% Retrieve global variables
chr = handles.chr;
position = handles.position;
snp = handles.snp;
listsnp = handles.listsnp;

fid = fopen('mapfile.map','w')
for k=1:length(listsnp)
    q=listsnp(k);
    fprintf(fid, '%s %s %d %s\r\n',char(chr(q)),char(snp(q)),0, char(position(q)) );
        
end
fclose(fid);


Contact us