image thumbnail
from pdf files search using keywords by harjeet singh
PDF file search using keywords: picks out, from any number of PDF files, the ones that contain the given keywords.

pdfsearch()
function pdfsearch()

% /////////////// automatic pdf search /////////////////////////

% files needed to run this program


%///////// before using this program, download these files into the current folder
%///////// fontbox-1.1.0   http://archive.apache.org/dist/pdfbox/1.1.0/
%///////// pdfbox-1.1.0    http://archive.apache.org/dist/pdfbox/1.1.0/
%//////// jdk-7u10         http://jdk7.java.net/download.html


% 1. create a directory named testing_folder and copy into it all the pdf
%    files that you want to search using keywords ///////////////////////

% 2. create another directory named destination_folder to receive copies
%    of the matching files (empty this folder before every search) //////

% 3. run pdfsearch.m, enter the keywords to be searched for in the files and
%    click on 'start search' to begin the search ////////////////////////

% created by Er. Harjeet Singh on 11-6-2012
% Rad Innovations Sector 70 Mohali
% 0172-4667778, 9216497778



clear all
close all
clc

%/////////////////////////// GUI command set //////////////////////////////
%//////////////////////////////////////////////////////////////////////////


figure_1 = figure('MenuBar','none','Name','PDF search (hary eye)','NumberTitle','off','Position',[100,100,390,500],'Color',[.5 .5 .5]);


a1 = uipanel('BorderType','etchedin','ForegroundColor','Black','BackgroundColor',[.5 .5 .5],'Units'...
,'characters','Title','controls','Position',[10 14 60 24],'HighlightColor',[0 0 0]);

search_key1 =uicontrol('Parent',a1,'Style','text','String','keyword 1','Position',[5,210,100,30],'BackgroundColor',[.5 .5 .5],...
'FontSize',11,'HorizontalAlignment','Center');

search_key2 =uicontrol('Parent',a1,'Style','text','String','keyword 2','Position',[5,170,100,30],'BackgroundColor',[.5 .5 .5],...
'FontSize',11,'HorizontalAlignment','Center');

search_key3 =uicontrol('Parent',a1,'Style','text','String','keyword 3','Position',[5,130,100,30],'BackgroundColor',[.5 .5 .5],...
'FontSize',11,'HorizontalAlignment','Center');

search_key4 =uicontrol('Parent',a1,'Style','text','String','keyword 4','Position',[5,90,100,30],'BackgroundColor',[.5 .5 .5],...
'FontSize',11,'HorizontalAlignment','Center');

search_key5 =uicontrol('Parent',a1,'Style','text','String','keyword 5','Position',[5,50,100,30],'BackgroundColor',[.5 .5 .5],...
'FontSize',11,'HorizontalAlignment','Center');

search_key6 =uicontrol('Parent',a1,'Style','text','String','keyword 6','Position',[5,10,100,30],'BackgroundColor',[.5 .5 .5],...
'FontSize',11,'HorizontalAlignment','Center');

sel_image_main9 =uicontrol('Style','Pushbutton','String','start search','Position',[52,140,100,30],'BackgroundColor',[.7 .7 .7],...
'FontSize',11,'HorizontalAlignment','Center','CallBack',@start_data_cap);

sel_image_main10 =uicontrol('Style','Pushbutton','String','stop search','Position',[52,90,100,30],'BackgroundColor',[.7 .7 .7],...
'FontSize',11,'HorizontalAlignment','Center','CallBack',@stop_data_cap);


sel_image_main11 =uicontrol('Style','Pushbutton','String','exit','Position',[250,90,100,50],'BackgroundColor',[.8 .5 .5],...
'FontSize',11,'HorizontalAlignment','Center','CallBack',@exit_routine);


world1 =uicontrol('Parent',a1,'Style','Edit','String','IEEE','Position',[130,220,150,20],'BackgroundColor',[1 1 1],...
'FontSize',11,'HorizontalAlignment','Center');

world2 =uicontrol('Parent',a1,'Style','Edit','String','2012','Position',[130,180,150,20],'BackgroundColor',[1 1 1],...
'FontSize',11,'HorizontalAlignment','Center');

world3 =uicontrol('Parent',a1,'Style','Edit','String','','Position',[130,140,150,20],'BackgroundColor',[1 1 1],...
'FontSize',11,'HorizontalAlignment','Center');

world4 =uicontrol('Parent',a1,'Style','Edit','String','','Position',[130,100,150,20],'BackgroundColor',[1 1 1],...
'FontSize',11,'HorizontalAlignment','Center');

world5 =uicontrol('Parent',a1,'Style','Edit','String','','Position',[130,60,150,20],'BackgroundColor',[1 1 1],...
'FontSize',11,'HorizontalAlignment','Center');

world6 =uicontrol('Parent',a1,'Style','Edit','String','','Position',[130,20,150,20],'BackgroundColor',[1 1 1],...
'FontSize',11,'HorizontalAlignment','Center');

searching_text =uicontrol('Style','text','String','Found: 000','Position',[20,40,350,30],'BackgroundColor',[.5 .5 .5],...
'ForegroundColor',[.3 .8 .3],'FontSize',13,'HorizontalAlignment','Center');


drawnow;

found=0;                                    % intinializing found variable to zero
files=dir('testing_folder');                % reading data from testing folder
brk_loop=0;                                 % flag to break loop 

%////////////// call back functions /////////////////////////////////////////


function start_data_cap(varargin)           % call back function for start pushbutoon
    
    set(sel_image_main9,'BackgroundColor',[.9 .3 .3]);      % changing color of start switch to indicate buzy
    drawnow
    k1=get(world1,'String');                                % reding user giving key words to search in pdf files
    k2=get(world2,'String');
    k3=get(world3,'String');
    k4=get(world4,'String');
    k5=get(world5,'String');
    k6=get(world6,'String');
    
    len=1;                                                  % making cell array from different keywords
    if(length(k1)>0)
        key_words{len}=k1;
        len=len+1;
    end
    if(length(k2)>0)
        key_words{len}=k2;
        len=len+1;
    end
    if(length(k3)>0)
        key_words{len}=k3;
        len=len+1;
    end
    if(length(k4)>0)
        key_words{len}=k4;
        len=len+1;
    end
    if(length(k5)>0)
        key_words{len}=k5;
        len=len+1;
    end
    if(length(k6)>0)
        key_words{len}=k6;       
    end
    
        key_words
    
        javaaddpath('pdfbox-1.1.0.jar');                                % intilizing java pdfbox path
        javaaddpath('fontbox-1.1.0.jar');
        import java.io.File;
        import java.io.FileInputStream;
        import java.io.IOException;
        import org.apache.pdfbox.pdmodel.PDDocument.*;
        import org.apache.pdfbox.util.PDFTextStripper.*; 
        import org.apache.fontbox.cmap.CMapParser.*;
        
        pdfdoc = org.apache.pdfbox.pdmodel.PDDocument;
        pdfStripper = org.apache.pdfbox.util.PDFTextStripper;

        found=0;
        if(length(files)>2)
                ranking(1:length(files)-2,1:length(key_words))=0;
                for k=3:length(files)                                   % loop for searching all files in testing folder         
                    current=files(k).name;
                    current1=strcat('testing_folder\',current)
                    current2=strcat('destination_folder\',current);     % address for copying file in destination folder if have keywords   
                    
                    if(brk_loop==1)                                     % break loop if user press stop button in between this loop
                        brk_loop=0;
                        break;
                    end
                     
                    file_read=0;
                    try                                                % try to read file if any problem exists skip that file
                        pdfdoc = pdfdoc.load(current1);
                        file_read=1;
                    end
                    if(file_read==1)
                            en_flag=pdfdoc.isEncrypted;                % checking for encyption key current pdf

                            tf=length(files)-1;
                            cur=k-2;
                            dummy=strcat('Found:',num2str(found),'.   .evaluating:',num2str(cur),'.    .out of:',num2str(tf));
                            set(searching_text,'String',dummy);
                            drawnow

                            if(en_flag==0)
                                
                                file_read=0;
                                try
                                    pdfstr = char(pdfStripper.getText(pdfdoc));
                                    pdfdoc.close()
                                    file_read=1;
                                end
                                if(file_read==1)                                        
                                        for i=1:length(key_words)           % searching for all keywords to be matched                         
                                            current_char=key_words{i};          
                                            positions=strfind(pdfstr,current_char); % finding no of times current keyworld is in file
                                            ranking(k-2,i)=ranking(k-2,i)+length(positions);    % giving ranking to the current page
                                            only_rank=ranking(k-2,:);
                                            only_rank=logical(only_rank>0);
                                            only_rank=sum(only_rank(:));            % adding all logical to find whether all key words include or not
                                            if(only_rank==length(key_words))        % if have all keywords than copy that file in destination folder
                                                display('copied one')
                                                copyfile(current1,current2)
                                                found=found+1;                                
                                                pause(1)
                                            end
                                        end                        
                                end            
                            end
                    end
                end
        end
    set(sel_image_main9,'BackgroundColor',[.7 .7 .7]);
    drawnow
end

function stop_data_cap(varargin) 
    brk_loop=1;
end

function exit_routine(varargin)
    exit
end
    

    


end







Contact us