image thumbnail

Fetch (Big)

by

 

27 Apr 2012 (Updated )

Fetch command useful for SELECT queries on large data sets.

fetch_big(connection, query, varargin)
function out = fetch_big(connection, query, varargin)

% out = fetch_big(connection, query, ...)
%
% Fetch command useful for SELECT queries on large data sets. First, it
% runs a query to determine the number of rows being returned, then
% preallocates the space necessary  for the input query's result, then runs
% the main query, fetching the results in batches.
% 
% This allows pulling much more information from a database in a single 
% query than the built-in methods (by using batches to avoid using too much
% of the Java heap) and does this much faster (by preallocating space for 
% the output).
%
% For queries with forms other than 'SELECT col1, col2, ... coln FROM table
% ...', it is necessary to provide an input "count" query to determine the
% number of rows, such as 'SELECT COUNT(*) FROM table...'.
%
% Inputs:
% connection - An open database connection
% query      - SQL query to use
%
% Outputs:
% out        - Result formatted according to setdbprefs('DataReturnFormat')
%
% out = fetch_big(connection, query) tries to auto-generate the count
% query as 'SELECT COUNT(*) FROM table' where |table| is determined from
% the input, tries to determine how many columns are being selected from 
% the query, and uses the default batch size (10,000).
%
% out = fetch_big(connection, query, count_query) does the same as above, 
% but uses the input row count query (e.g., 'SELECT COUNT(*) FROM table').
%
% out = fetch_big(connection, query, count_query, batch_size)
% does the same as the above, but uses the input batch size -- the number
% of rows returned from the database at a time.
%
% Note: When autodetecting the count query, the query must be of the form:
%
% SELECT ... FROM table ...
%
% Thanks to Asmita Barve at The MathWorks for her input.
% 
% Tucker McClure @ The MathWorks
% Copyright 2012 MathWorks, Inc.

    % First, make sure the connection is valid.
    if ~isconnection(connection)
        error('fetch_big:InvalidConnection', ...
              'Invalid connection. Please connect to the database first.');
    end
    
    % Get the input count query, or let it default to ''.
    count_query = getargin(varargin, 1, '');
    
    % Get the input batch size or let it default to 2500.
    batch_size = getargin(varargin, 2, 2500);
    
    % If the use didn't provide the count query, then we need to figure 
    % it out ourselves.
    if isempty(count_query)
        
        % Figure out what's being selected and from where.
        tokens = regexpi(query, 'SELECT .+ FROM (.+)', 'tokens');
                
        % If the count_query isn't set, generate it.
        if isempty(tokens)
            error('fetch_big:NeedAdditionalInfo', ...
                  ['Cannot auto-determine count query. '...
                   'Please provide that argument explicitly.']);
        else
            count_query = ['SELECT COUNT(*) FROM ' tokens{1}{1}];
        end
        
    end

    % Count the number of rows we'll be getting back.
    data_return_format = setdbprefs('DataReturnFormat');
    setdbprefs('DataReturnFormat', 'numeric');
    num_rows = fetch(connection, count_query);
    setdbprefs('DataReturnFormat', data_return_format);

    % Now's time for the actual query.
    result_cursor = exec(connection, query);

    % We have to do different things for each data return type.
    switch data_return_format
        
        % For numeric and cell arrays, we can just initialize differently
        % but populate the same way.
        case {'numeric', 'cellarray'}
            
            % Use one database call to get all the fields, then set the
            % right sizes, then enter the loop for the remaining data.
            result = fetch(result_cursor, 1);
            num_columns = size(result.Data, 2);
            
            % Figure out initialization.
            if strcmp(data_return_format, 'numeric')
                out = zeros(num_rows, num_columns);
            else
                out = cell(num_rows, num_columns);
            end
            
            % Set the first row.
            out(1, :) = result.Data;
            
            % Loop over all batches.
            for k = 2:batch_size:num_rows
                result = fetch(result_cursor, batch_size);
                out(k:(k+size(result.Data, 1)-1), 1:num_columns) = ...
                    result.Data;
            end
            
        case {'dataset', 'structure'}
            
            % Use one database call to get all the fields, then set the
            % right sizes, then enter the loop for the remaining data.
            result = fetch(result_cursor, 1);
            out = result.Data;
            
            % We need to get all the field names, which is different for
            % structures than for datasets.
            if strcmp(data_return_format, 'dataset')
                out_fields = out.Properties.VarNames;
            else
                out_fields = fields(out);
            end
            
            % Now preallocate all the space we'll need. TODO: Add type
            % sensativity.
            for k = 1:length(out_fields)
                if iscell(out.(out_fields{k}))
                    out.(out_fields{k}){2:num_rows, 1} = [];
                else
                    out.(out_fields{k})(2:num_rows, 1) = 0;
                end
            end
            
            % Loop over all batches.
            for k = 2:batch_size:num_rows
                result = fetch(result_cursor, batch_size);
                n_these_rows = size(result.Data.(out_fields{1}), 1);
                for field_num = 1:length(out_fields)
                    field = out_fields{field_num};
                    out.(field)(k:(k+n_these_rows-1), 1) = ...
                        result.Data.(field);
                end
            end

    end

    % Close resources.
    close(result);
    close(result_cursor);

end

Contact us