view main/io/inst/private/__OCT_gnm2oct__.m @ 12378:355d7a8e6382 octave-forge

FormatValue tag added in initial regexp parsing pattern
author prnienhuis
date Thu, 20 Feb 2014 21:58:20 +0000
parents 46201978102c
children 969438a9c069
line wrap: on
line source

## Copyright (C) 2013,2014 Philip Nienhuis
## 
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 3 of the License, or
## (at your option) any later version.
## 
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
## 
## You should have received a copy of the GNU General Public License
## along with Octave; see the file COPYING.  If not, see
## <http://www.gnu.org/licenses/>.

## -*- texinfo -*- 
## @deftypefn {Function File} [@var{raw}, @var{ods}, @var{rstatus} = __OCT_gnm2oct__ (@var{ods}, @var{wsh}, @var{range}, @var{opts})
## Internal function for reading data from a Gnumeric worksheet
##
## @seealso{}
## @end deftypefn

## Author: Philip Nienhuis <prnienhuis at users.sf.net>
## Created: 2013-10-01
## Updates:
## 2013-10-02 Drop return arg rstatus
## 2013-10-02 Significant speed-up using regexp and splitting xml in chunks ~4e5 chars
## 2013-11-03 Fix processing chunks
##     ''     Get ValueType using getxmlattv, not regexp
##     ''     Process Boolean type
## 2013-11-15 Replace slow xml node parsing by regexp a la Markus Bergholz
## 2014-02-20 Add ValueFormat tag to regexp pattern

function [ rawarr, xls, rstatus] = __OCT_gnm2oct__ (xls, wsh, cellrange='', spsh_opts)

  rstatus = 0;

  ## Check if requested worksheet exists in the file & if so, get sheet
  if (isnumeric (wsh))
    if (wsh > numel (xls.sheets.sh_names) || wsh < 1)
      error ("xls2oct: sheet number (%d) out of range (1 - %d)", wsh, numel (xls.sheets.sh_names));
    endif
  elseif (ischar (wsh))
    idx = strmatch (wsh, xls.sheets.sh_names);
    if (isempty (idx))
      error ("xls2oct: sheet '%s' not found in file %s", wsh, xls.filename);
    endif
    wsh = idx;
  endif

  ## Get requested sheet from info in file struct pointer. Open file
  fid = fopen (xls.workbook, "r");
  ## Go to start of requested sheet. This requires reading from start, not
  ## from after 1st xml id line
  fseek (fid, xls.sheets.shtidx(wsh), 'bof');
  ## Compute size of requested chunk
  nchars = xls.sheets.shtidx(wsh+1) - xls.sheets.shtidx(wsh);
  ## Get the sheet
  xml = fread (fid, nchars, "char=>char").';
  fclose (fid);

  ## Add xml to struct pointer to avoid __OCT_getusedrange__ to read it again
  xls.xml = xml;

  ## Check ranges
  [ firstrow, lastrow, leftcol, rightcol ] = getusedrange (xls, wsh);

  ## Remove xml field
  xls.xml = [];
  xls = rmfield (xls, "xml");

  if (isempty (cellrange))
    if (firstrow == 0 && lastrow == 0)
      ## Empty sheet
      rawarr = {};
      printf ("Worksheet '%s' contains no data\n", xls.sheets.sh_names{wsh});
      rstatus = 1;
      return;
    else
      nrows = lastrow - firstrow + 1;
      ncols = rightcol - leftcol + 1;
    endif
  else
    [~, nr, nc, tr, lc] = parse_sp_range (cellrange);
    ## Check if requested range exists
    if (tr > lastrow || lc > rightcol)
      ## Out of occupied range
      warning ("xls2oct: requested range outside occupied range");
      rawarr = {};
      xls.limits = [];
      return
    endif
    
    lastrow  = min (lastrow, firstrow + nr - 1);
    rightcol = min (rightcol, leftcol + nc - 1);
  endif

  ## Get cell nodes
  cells = getxmlnode (xml, "gnm:Cells");

  ## Pattern gets all required tokens in one fell swoop
  pattrn = '<gnm:Cell Row="(\d*?)" Col="(\d*?)" (?:ValueType="(\d*?)"|ExprID="(\d*?)")(?: ValueFormat="\w+")>(.*?)</gnm:Cell>';
  allvals = cell2mat (regexp (cells, pattrn, "tokens"));

  ## Reshape into 4 x ... cell array
  allvals = reshape (allvals, 4,  numel (allvals) / 4);

  ## Convert 0-based rw/column indices to 1-based numeric
  allvals(1:2, :) = num2cell (str2double (allvals(1:2, :)) + 1);
  ## Convert cell type values to double
  allvals(3, :)   = num2cell (str2double (allvals(3, :)));
  ## Convert numeric cell values to double
  in = find ([allvals{3,:}] == 40);
  allvals(4, in)  =  num2cell (str2double(allvals(4, in))');
  ## Convert boolean values to logical
  il = find ([allvals{3,:}] == 20);
  allvals(4, il)  =  num2cell (cellfun (@(x) strcmpi (x, "true"), allvals(4, il)));

  ## Get limits of data rectangle
  trow = min (cell2mat (allvals(1, :)));
  brow = max (cell2mat (allvals(1, :)));
  rcol = max (cell2mat (allvals(2, :)));
  lcol = min (cell2mat (allvals(2, :)));
  xls.limits = [lcol rcol; trow brow];

  ## Create data array
  rawarr = cell (brow-trow+1, rcol-lcol+1);

  ## Compute linear indices into data array from 1-based row/col indices
  idx = sub2ind (size (rawarr), [allvals{1, :}] - trow + 1, [allvals{2,:}] - lcol + 1);
  ## And assign cell values to data array
  rawarr(idx) = allvals(4, :);

  ## FIXME maybe reading parts of the data can be done faster above by better regexps
  ##       or sorting on row & truncating followed by sorting on columns and truncating
  if (! isempty (cellrange))
    ## We'll do it the easy way: just read all data, then return only the requested part
    xls.limits = [max(lcol, lc), min(rcol, lc+nc-1); max(trow, tr), min(brow, tr+nr-1)];
    ## Correct spreadsheet locations for lower right shift or raw
    rc = trow - 1;
    cc = lcol - 1;
    rawarr = rawarr(xls.limits(2, 1)-rc : xls.limits(2, 2)-rc, xls.limits(1, 1)-cc : xls.limits(1, 2)-cc);
  endif
 
  if (! isempty (allvals))
    rstatus = 1;
  endif

endfunction