changeset 12226:0117d73458d7 octave-forge

Replace fgetl calls by fread to cope with EOLs; Adapted regular expressions; fix regexpr for string search; add isfinite() check; replace call to __col_str_to_number with __OCT_cc__
author prnienhuis
date Fri, 20 Dec 2013 13:11:15 +0000
parents 0504f7cafe76
children 3b4a57d5eab2
files main/io/inst/private/__OCT_xlsx2oct__.m
diffstat 1 files changed, 33 insertions(+), 24 deletions(-) [+]
line wrap: on
line diff
--- a/main/io/inst/private/__OCT_xlsx2oct__.m	Fri Dec 20 13:07:16 2013 +0000
+++ b/main/io/inst/private/__OCT_xlsx2oct__.m	Fri Dec 20 13:11:15 2013 +0000
@@ -1,4 +1,5 @@
-## Copyright (C) 2013 Markus Bergholz
+## Copyright (C) 2013 Markus Bergholz
+## Parts Copyright (C) 2013 Philip Nienhuis
 ## 
 ## This program is free software; you can redistribute it and/or modify
 ## it under the terms of the GNU General Public License as published by
@@ -27,20 +28,26 @@
 ## 2010-10-20 Transplanted & adapted section for text string worksheet names (PRN)
 ##     ''     Much code restyled into Octave coding conventions
 ## 2013-10-28 More fixes by Markus
-## 2013-11-02 Added rstatus return arg (needed by xlsread.m)
+## 2013-11-02 (PRN) Added rstatus return arg (needed by xlsread.m)
 ## 2013-11-04 (PRN) Adapted regexp search strings to include (numeric) formulas and booleans
 ##     ''     (PRN) Commented out code for only numeric data until contiguousness is checked
-## 2013-11-08 Fix reading date/time
+## 2013-11-08 (PRN) Fix reading date/time
 ##     ''     Rework code to keep cell array, not a numeric matrix
 ##     ''     Add reading formulas
-## 2013-11-09 Add reading strings & formulas
+## 2013-11-09 (PRN) Add reading strings & formulas
 ##     ''     Rearrange code
 ##     ''     Prepare for fast reading (still uncommented)
 ##     ''     Implement selected range (still rough ATM but for devs the easiest)
-## 2013-11-10 Fix typo preventing reading named worksheets instead of indices
-## 2013-11-13 Pretty text output
-## 2013-11-15 Catch empty sharedString.xml (means no fixed strings)
-## 2013-11-16 Replace fgetl calls by fread to cope with EOLs
+## 2013-11-10 (PRN) Fix typo preventing reading named worksheets instead of indices
+## 2013-11-13 (PRN) Pretty text output
+## 2013-11-15 (PRN) Catch empty sharedString.xml (means no fixed strings)
+## 2013-11-16 (PRN) Replace fgetl calls by fread to cope with EOLs
+## 2013-12-12 (PRN) Adapted regular exprssions for formulas_as_text to cope with POI
+##     ''     Adapted regular expression for shared strings to cope with empty strings
+## 2013-12-14 (PRN) Adapt regexpr for values (include "n" as value type)
+##     ''     Fix regexpr for strings ( <v>\d? => <v>\d+ )
+##     ''     Add isfinite() check before attempt to process fixed strings
+## 2013-12-19 (MB) Replace call to __col_str_to_number with __OCT_cc__
 
 function [ raw, xls, rstatus ] = __OCT_xlsx2oct__ (xls, wsh, crange='', spsh_opts)
 
@@ -119,12 +126,12 @@
     ## General note for tuning: '"([^"]*)"' (w/o single quotes) could be faster than '"(.*?)"'
     ## (http://stackoverflow.com/questions/2503413/regular-expression-to-stop-at-first-match comment #7)
 
-    ## Beloware loads of nested IFs. They're needed to catch empty previous results, even empty sheets
+    ## Below are loads of nested IFs. They're needed to catch empty previous results, even empty sheets
 
     ## 1. Get pure numbers, including booleans, double and boolean formula results, from cells w/o 's=""' tag
-    val = cell2mat    (regexp (rawdata, '<c r="\w+"(?: t="[b]+")?>(?:<f.+?(?:</f>|/>))?<v>(.*?)</v>', "tokens"));
+    val = cell2mat (regexp (rawdata, '<c r="\w+"(?: t="[bn]+")?>(?:<f.+?(?:</f>|/>))?<v>(.*?)</v>', "tokens"));
     if (! isempty (val))
-      valraw = cell2mat (regexp (rawdata, '<c r="(\w+)"(?: t="[b]+")?>(?:<f.+?(?:</f>|/>))?<v>.*?</v>', "tokens"));
+      valraw = cell2mat (regexp (rawdata, '<c r="(\w+)"(?: t="[bn]+")?>(?:<f.+?(?:</f>|/>))?<v>.*?</v>', "tokens"));
     endif
 
     ## If val is still empty, try another regexpression (PRN: will this ever work? haven't seen such cells)
@@ -154,12 +161,13 @@
     endif
     ## Turn strings into numbers
     val = num2cell (str2double (val)); 
-
+
     ## 2. String / text formulas (cached results are in this sheet; fixed strings in <sharedStrings.xml>)
     ## Formulas
-    if (spsh_opts.formulas_as_text)
+    if (spsh_opts.formulas_as_text)
+
       ## Get formulas themselves as text strings. Formulas have no 't="s"' attribute. Keep starting '>' for next line
-      valf1 = cell2mat (regexp (rawdata, '<c r="\w+" s="\d"(?: t="(?:[^s]?|str)")?><f(?: .*?)*(>.+?)</f><v>.*?</v>', "tokens"));
+      valf1 = cell2mat (regexp (rawdata,     '<c r="\w+"(?: s="\d+")?(?: t="\w+")?><f(?: .*?)*(>.*?)</f>(?:<v>.*?</v>)?', "tokens"));
       if (! isempty (valf1))
         valf1 = regexprep (valf1, '^>', '=');
         ## Pretty text output
@@ -167,7 +175,7 @@
         valf1 = strrep (valf1, "&lt;", "<");
         valf1 = strrep (valf1, "&gt;", ">");
         valf1 = strrep (valf1, "&amp;", "&");
-        valrawf1 = cell2mat(regexp (rawdata, '<c r="(\w+)" s="\d"(?: t="(?:[^s]?|str)")?>(?:<f(?: .*?)*>.+?</f>)<v>.*?</v>', "tokens"));
+        valrawf1 = cell2mat(regexp (rawdata, '<c r="(\w+)"(?: s="\d+")?(?: t="\w+")?><f(?: .*?)*>.*?</f>(?:<v>.*?</v>)?', "tokens"));
         if (isempty (val))
           val = valf1;
         else
@@ -187,7 +195,7 @@
         valf2 = strrep (valf2, "&lt;", "<");
         valf2 = strrep (valf2, "&gt;", ">");
         valf2 = strrep (valf2, "&amp;", "&");
-        valrawf2 = cell2mat(regexp (rawdata, '<c r="(\w+)" s="\d" t="(?:[^sb]?|str)">(?:<f.+?(?:</f>|/>))<v>.*?</v>', "tokens"))
+        valrawf2 = cell2mat(regexp (rawdata, '<c r="(\w+)" s="\d" t="(?:[^sb]?|str)">(?:<f.+?(?:</f>|/>))<v>.*?</v>', "tokens"));
         if (isempty (val))
           val = valf2;
           valraw = valrawf2;
@@ -210,15 +218,16 @@
         endif
       endif
       clear valf3 valrawf3 ;
-    endif
+    endif
+
     ## 3. Strings
     if (! isempty (strings))
       ## Extract string values. May be much more than present in current sheet
-      ctext = cell2mat (regexp (strings, '<si><t>(.+?)</t></si>', "tokens"));
+      ctext = cell2mat (regexp (strings, '<si><t(?:>(.+?)</t>|(.*)/>)</si>', "tokens"));
       ## Pointers into sharedStrings.xml. "Hard" (fixed) strings have 't="s"' attribute
       ## For reasons known only to M$ those pointers are zero-based, so:
-      vals = str2double (cell2mat (regexp (rawdata, '<c r="\w+"(?: s="\d")? t="s"><v>(\d?)</v>', "tokens"))) + 1;
-      if (! isempty (vals))
+      vals = str2double (cell2mat (regexp (rawdata, '<c r="\w+"(?: s="\d")? t="s"><v>(\d+)</v>', "tokens"))) + 1;
+      if (! isempty (vals) && isfinite (vals))
         ## Get actual values
         vals = ctext(vals);
         ## Pretty text output
@@ -227,7 +236,7 @@
         vals = strrep (vals, "&gt;", ">");
         vals = strrep (vals, "&amp;", "&");
          ## Cell addresses
-        valraws = cell2mat (regexp (rawdata, '<c r="(\w+)"(?: s="\d")? t="s"><v>\d?</v>', "tokens"));
+        valraws = cell2mat (regexp (rawdata, '<c r="(\w+)"(?: s="\d")? t="s"><v>\d+</v>', "tokens"));
         if (isempty (val))
           val = vals;
           valraw = valraws;
@@ -238,7 +247,7 @@
       endif
       clear vals valraws ;
     endif
-
+
     ## If val is empty, sheet is empty
     if (isempty (val))
       xls.limits = [];
@@ -261,7 +270,7 @@
     if (0 < numel (idx.all))
       idx.num = str2double (cell2mat (regexp (idx.all, '(\d+|\d+\d+|\d+\d+\d+|\d+\d+\d+\d+|\d+\d+\d+\d+\+d|\d+\d+\d+\d+\d+\d+)?', "match"))')';
       idx.alph = cell2mat (regexp (idx.all, '([A-Za-z]+|[A-Za-z]+[A-Za-z]+|[A-Za-z]+[A-Za-z]+[A-Za-z]+)?', "match"));
-      idx.alph = double (cell2mat (cellfun (@__col_str_to_number, vi.alph, "UniformOutput", 0)));
+      idx.alph = double (cell2mat (cellfun (@__OCT_cc__, vi.alph, "UniformOutput", 0)));
     else
       ## To prevent warnings or errors while calculating corresponding NaN matrix
       idx.num = [];
@@ -269,7 +278,7 @@
     end
     ## Transform column character to column number
     ## A -> 1; C -> 3, AB -> 28 ...
-    vi.col = double (cell2mat (cellfun (@__col_str_to_number, vi.alph, "UniformOutput", 0)));
+    vi.col = double (cell2mat (cellfun (@__OCT_cc__, vi.alph, "UniformOutput", 0)));
 
     ## Find data rectangle limits
     idx.mincol = min ([idx.alph vi.col]);