function gutenberg_pages
% GUTENBERG_PAGES downloads from http://www.gutenberg.org/robot/harvest
% the web page addresses of e-text and audio-books. It is meant to be
% used before functions gutenberg_download_mp3 (which downloads mp3 files
% from Gutenberg Project) and/or gutenberg_download_zip (which downloads
% zip files from Gutenberg Project).
%
% The function takes no input and returns no output: results are written
% to the file gutenberg_files.mat (see below).
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% EXAMPLE:
% gutenberg_pages
% gutenberg_download_mp3
% gutenberg_download_zip
%
% DOWNLOADING ALL THE ARCHIVE MIGHT TAKE --> SEVERAL DAYS <--
% AFTER SUCH A HUGE DOWNLOAD --> HUNDREDS OF GIGABYTES <-- PLEASE KEEP
% ONE COPY FOR YOU AND GIVE ONE TO A FRIEND, OR A LIBRARY, OR A SCHOOL,
% OR A PASSER-BY.
% CONTRIBUTE TO DISTRIBUTE.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% IMPORTANT: PLEASE, REMEMBER THAT GUTENBERG DVDS CAN BE DOWNLOADED FOR
% FREE FROM:
% http://www.gutenberg.org/wiki/Gutenberg:The_CD_and_DVD_Project
%
% THE DVDS DO NOT CONTAIN THE MP3 FILES
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% YOU CAN GET AN OFFLINE VERSION OF THE PROJECT GUTENBERG WEB SITE:
% http://www.gutenberg.org/wiki/Gutenberg:Feeds
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% YOU CAN GET THE PROJECT GUTENBERG CATALOG DATA:
% http://www.gutenberg.org/wiki/Gutenberg:Feeds
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% IMPORTANT: PLEASE, BEFORE YOU USE THIS FUNCTION, CAUSING A HEAVY LOAD
% FOR GUTENBERG SERVERS (AND SLOWING DOWN A LOT OF OTHER VISITORS OF THE
% PROJECT), CONSIDER USING THE FILE THAT IS STORED IN THE PACKAGE OF THIS
% FUNCTION AND THAT IS UPDATED TO THE 7th of January 2009. IN ORDER TO DO
% SO:
% load('gutenberg_files', 'file_name_mp3', 'file_address_mp3', 'file_n_mp3', 'file_name_zip', 'file_address_zip', 'file_n_zip');
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Output is saved in the file gutenberg_files.mat where the following
% variables are stored:
%
% file_n_mp3: number of mp3 files to be downloaded
% file_name_mp3: name of mp3 files to be downloaded
% file_address_mp3: internet address of mp3 files to be downloaded
% file_n_zip: number of zip files to be downloaded
% file_name_zip: name of zip files to be downloaded
% file_address_zip: internet address of zip files to be downloaded
% webpage_n: number of Gutenberg web-pages where the lists are stored
% webpage_address: internet address of Gutenberg lists
% webpage_contents: HTML source of Gutenberg webpages of lists
%
% The file lists are empty cell arrays (and the counters are 0) when no
% file of the corresponding type is found.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% This is a very rudimentary way of downloading files from the Project
% Gutenberg (http://www.gutenberg.org).
% If you know of a more elegant/efficient way to obtain the same result
% and you want to share I would greatly appreciate it. If you can show me
% how to do the same (or better) in a different programming language (C,
% Java, etc.), I will greatly appreciate it.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%                                                                         %
% Author: Francesco Pozzi                                                 %
% E-Mail: francesco.pozzi@anu.edu.au                                      %
% Date: 7 January 2009                                                    %
%                                                                         %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Root of every address built below.
base_url = 'http://www.gutenberg.org';

% Number of files to be downloaded, and the lists themselves. The lists
% are initialised explicitly so that saving and post-processing also work
% when no mp3 (or no zip) link is ever encountered.
file_n_mp3 = 0;
file_n_zip = 0;
file_name_mp3 = {};
file_address_mp3 = {};
file_name_zip = {};
file_address_zip = {};

% Number of web pages containing the lists:
webpage_n = 1;
webpage_address = {};
webpage_contents = {};

% Start the timer (used to throttle the requests)
tic;

% The first web page from which to start is:
% http://www.gutenberg.org/robot/harvest
% which contains the list of web page addresses we wish to download.
% NOTE: urlread is flagged as "not recommended" in recent MATLAB releases
% (webread is its successor); it is kept here for backward compatibility.
webpage_address{webpage_n} = [base_url '/robot/harvest'];
webpage_contents{webpage_n} = urlread(webpage_address{webpage_n});

% Each HTML hyperlink of interest follows the characters '<p><a href="'
% and ends at the next '"'. The token captures the text in between.
link_pattern = '<p><a href="([^"]*)"';

while true
    % Collect, in order of appearance, every hyperlink of the current page
    links = regexp(webpage_contents{webpage_n}, link_pattern, 'tokens');

    % Links starting with '/' are files; the first link that does not
    % start with '/' points to the next list and ends the scan of this page.
    next_link = '';
    for k = 1:numel(links)
        link = links{k}{1};
        if isempty(link)
            % An empty href carries no information: ignore it
            continue;
        end
        if link(1) == '/'
            % Let's keep track of all files we wish to download. Only the
            % all-lowercase and all-uppercase endings are recognised.
            file_name = link((find(link == '/', 1, 'last') + 1):end);
            if ~isempty(regexp(link, '(mp3|MP3)$', 'once'))
                file_n_mp3 = file_n_mp3 + 1;
                file_address_mp3{file_n_mp3} = [base_url link];
                file_name_mp3{file_n_mp3} = file_name;
            elseif ~isempty(regexp(link, '(zip|ZIP)$', 'once'))
                file_n_zip = file_n_zip + 1;
                file_address_zip{file_n_zip} = [base_url link];
                file_name_zip{file_n_zip} = file_name;
            end
        else
            next_link = link;
            break;
        end
    end

    % No link to a further list: the harvest is complete
    if isempty(next_link), break, end

    % Please, make a pause between a download and the next: on average no
    % more than one page is requested every 2 seconds.
    elapsed = toc;
    if elapsed < 2 * webpage_n, pause(2 * webpage_n - elapsed); end

    % Go to the next list
    webpage_n = webpage_n + 1;
    webpage_address{webpage_n} = [base_url '/robot/' next_link];
    webpage_contents{webpage_n} = urlread(webpage_address{webpage_n});

    disp(webpage_n)
    % It is recommended to store the data every 100 loops or so
    if rem(webpage_n, 100) == 0
        save('gutenberg_files', 'webpage_contents', 'webpage_address', 'file_name_mp3', 'file_name_zip', 'file_address_mp3', 'file_address_zip');
    end
end

% Delete doubles among addresses and sort by file name (mp3)
[file_address_mp3, file_name_mp3] = gutenberg_unique_sorted(file_address_mp3, file_name_mp3);
file_n_mp3 = numel(file_address_mp3);

% Delete doubles among addresses and sort by file name (zip)
[file_address_zip, file_name_zip] = gutenberg_unique_sorted(file_address_zip, file_name_zip);
file_n_zip = numel(file_address_zip);

save('gutenberg_files', 'webpage_contents', 'webpage_address', 'file_name_mp3', 'file_name_zip', 'file_address_mp3', 'file_address_zip', 'file_n_mp3', 'file_n_zip', 'webpage_n');

function [addresses, names] = gutenberg_unique_sorted(addresses, names)
% GUTENBERG_UNIQUE_SORTED removes duplicated addresses and then sorts the
% two parallel lists by file name.
%
% addresses: cell array of strings, one internet address per file
% names:     cell array of strings, names(i) is the name of addresses(i)
%
% Both outputs are row cell arrays of equal length (empty cell arrays
% when the input lists are empty).
if isempty(addresses)
    addresses = {};
    names = {};
    return;
end
% Keep one entry per distinct address, carrying the matching name along
[addresses, keep] = unique(addresses);
names = names(keep);
% Order by file name, applying the same permutation to the addresses
[names, order] = sort(names);
addresses = addresses(order);
% Force a row shape regardless of what unique/sort return
addresses = reshape(addresses, 1, []);
names = reshape(names, 1, []);