No BSD License  

Highlights from
Analyze FEX download data

image thumbnail

Analyze FEX download data

by

 

25 Nov 2007 (Updated )

(with the aid of MySQL)

examineDownloadData.m
%% Connect to MySQL
mym('open','localhost','root','zhir')
mym('use test')

%% Increase selected MySQL buffers in a bid to boost performance
mym(['set ' ...
     'bulk_insert_buffer_size = 500000000, ' ... 
     'global key_buffer_size  = 500000000, ' ...
     'myisam_sort_buffer_size = 150000000, ' ...
     'sort_buffer_size        = 150000000']) 
 
%% Read download data into a MySQL table, retaining unique records
mym('drop table if exists temp')
mym('drop table if exists fex')
mym(['create table temp (' ...
     'time datetime, ' ...
     'id   mediumint unsigned, ' ...
     'rank smallint  unsigned, ' ...
     'file smallint  unsigned, ' ...
     'fcat tinyint   unsigned, ' ...
     'down smallint  unsigned)'])
mym(['load data infile ''D:/Program Files/Matlab/work/fexstat/fex.txt'' ' ...
     'into table temp lines terminated by ''\n'' ' ...
     '(@time,id,rank,file,fcat,down) ' ...
     'set time = str_to_date(@time,''%d-%b-%Y %H:%i:%s'')'])
mym(['create table fex as ' ...
     'select distinct * from temp ' ...
     'where file > 0 ' ...
     'order by file, time'])
 
%% Read category list into a MySQL table, retaining unique records
mym('drop table if exists temp')
mym('drop table if exists cat')
mym(['create table temp (' ...
     'fcat       tinyint unsigned, ' ...
     'drilldown  tinyint unsigned, ' ...
     'catname    varchar(70))'])
mym(['load data infile ''D:/Program Files/Matlab/work/fexstat/cat.txt'' ' ...
     'into table temp lines terminated by ''\n'' '])
mym('create table cat as select distinct * from temp order by fcat')

%% Switch from 'stock' to 'flow' downloads and change time accounting
mym('alter table fex add row int auto_increment primary key')
mym('drop table if exists filestats')
mym(['create table filestats as ' ...
     'select a.id, a.file, a.fcat, ' ...
     'date(a.time)      as date, ' ...
     'hour(a.time)      as hour, ' ...
     '(b.down - a.down) as down  ' ...
     'from fex a left join fex b ' ...
     'on  a.file = b.file ' ...
     'and b.row  = a.row + 1 ' ...
     'and timestampdiff(minute,a.time,b.time) between 40 and 80']);
mym('alter table filestats add scan int')
mym('update filestats set scan = 24*datediff(date,''2007-11-16'') + hour')

%% Index resulting table
mym('create index i on filestats(id)')
mym('create index j on filestats(file)')
mym('create index k on filestats(scan)')

%% Check completeness of author cross-sections, tracking the number of distinct ids found at any time
mym('drop table if exists temp')
mym('drop table if exists badscans')
mym(['create table temp as ' ...
     'select scan, count(distinct id) as n ' ...
     'from filestats ' ...
     'group by scan'])
mym(['create table badscans as ' ....
     'select scan ' ...
     'from temp ' ...
     'where n < 2560']) 
 
%% Calculate and plot downloads-per-hour totals for 'cleared' cross-sections
[t,x] = mym(['select distinct scan, sum(down) as n ' ...
             'from filestats ' ...
             'where scan not in (select scan from badscans) ' ...
             'group by scan ' ...
             'having n is not null']);
prepplot(t,x)
plot(t,x,'LineWidth',1,'Color',[.8 .8 .8],'LineStyle','.')
plot(t,hpfilter(x,4),'LineWidth',2,'Color',[.9  0  0],'LineStyle','-')
title('\bfDownloads of FEX submissions, November 16-22, 2007')
xlabel({'Number of downloads per hour, all available submissions', ...
        '(Local time GMT; solid line plots a Hodrick-Prescott smoothed series)'})

%% Examine the distribution of weekly downloads-per-author (ignoring incomplete cross-sections)
mym('drop table if exists authorstats')
mym('drop table if exists authortotals')
mym(['create table authorstats as ' ...
     'select distinct id, scan, sum(down) as down ' ...
     'from filestats ' ...
     'where scan not in (select scan from badscans) ' ...
     'group by id, scan'])
mym(['create table authortotals as ' ...
     'select distinct id, sum(down) as down ' ...
     'from authorstats ' ...
     'group by id ' ...
     'order by down desc'])
mym('alter table authortotals add rank smallint auto_increment primary key')

%% Plot download history for the contributor with most downloads over the period
[t,x] = mym(['select scan, down ' ...
             'from authorstats ' ...
             'where id in (select id from authortotals where rank = 1) ' ...
             'and down is not null']);
prepplot(t,x)
bar(t,x,'r')
title({'\bfDownloads of submissions by a selected contributor, November 16-22, 2007', ...
       '\rm(Contributor with most downloads over the observation period)'})
xlabel({'Number of downloads per hour','(Local time GMT)'})

%% Plot download history for the contributor with most downloads of a file in a single hour
[d,n] = mym(['select distinct down, '...
             'count(*) as n ' ...
             'from filestats ' ...
             'group by down ' ...
             'order by down desc']);
[d n]                                                           %#ok
mym('drop table if exists temp1')
mym(['create table temp1 as ' ...
     'select distinct id' ...
     'from filestats ' ...
     'where down > 300'])
[t,x] = mym(['select scan, down ' ...
             'from authorstats ' ...
             'where id in (select id from temp1) ' ...
             'and down is not null']);

prepplot(t,x)
bar(t,x,'r')
title({'\bfDownloads of submissions by a selected contributor, November 16-22, 2007', ...
       '\rm(Contributor with most downloads of a file in a single hour)'})
xlabel({'Number of downloads per hour','(Local time GMT)'})

%% Calculate and plot downloads per category
mym('drop table if exists temp')
mym('drop table if exists cattotals')
mym(['create table temp as ' ...
     'select distinct fcat, sum(down) as down ' ...
     'from filestats ' ...
     'where scan not in (select scan from badscans) ' ...
     'group by fcat ' ...
     'order by down'])
mym(['create table cattotals as '...
     'select temp.*, cat.catname, cat.drilldown ' ...
     'from temp left join cat using (fcat) ' ...
     'order by down desc'])
[x,n] = mym('select down, catname from cattotals');
n = deblank(n);
c = x/sum(x) < .009;
x = [x(~c); sum(x(c))]; 
n = [n(~c); {'Other'}];
n = strcat('\fontsize{7}',n);
for i = 1:length(n)
    n{i} = [n{i} ', ' num2str(x(i))];
end    
figure
set(gcf,'Color','w')
pie(x,n)
colormap autumn
title('\bfDownloads per category, November 16-22, 2007')

%%
% Example: none (Oh, the FEX code metrics..)

Contact us