Mercurial > forge
changeset 12226:0117d73458d7 octave-forge
Replace fgetl calls by fread to cope with EOLs; Adapted regular expressions; fix regexpr for string search; add isfinite() check; replace call to __col_str_to_number with __OCT_cc__
author | prnienhuis |
---|---|
date | Fri, 20 Dec 2013 13:11:15 +0000 |
parents | 0504f7cafe76 |
children | 3b4a57d5eab2 |
files | main/io/inst/private/__OCT_xlsx2oct__.m |
diffstat | 1 files changed, 33 insertions(+), 24 deletions(-) [+] |
line wrap: on
line diff
--- a/main/io/inst/private/__OCT_xlsx2oct__.m Fri Dec 20 13:07:16 2013 +0000 +++ b/main/io/inst/private/__OCT_xlsx2oct__.m Fri Dec 20 13:11:15 2013 +0000 @@ -1,4 +1,5 @@ -## Copyright (C) 2013 Markus Bergholz +## Copyright (C) 2013 Markus Bergholz +## Parts Copyright (C) 2013 Philip Nienhuis ## ## This program is free software; you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by @@ -27,20 +28,26 @@ ## 2010-10-20 Transplanted & adapted section for text string worksheet names (PRN) ## '' Much code restyled into Octave coding conventions ## 2013-10-28 More fixes by Markus -## 2013-11-02 Added rstatus return arg (needed by xlsread.m) +## 2013-11-02 (PRN) Added rstatus return arg (needed by xlsread.m) ## 2013-11-04 (PRN) Adapted regexp search strings to include (numeric) formulas and booleans ## '' (PRN) Commented out code for only numeric data until contiguousness is checked -## 2013-11-08 Fix reading date/time +## 2013-11-08 (PRN) Fix reading date/time ## '' Rework code to keep cell array, not a numeric matrix ## '' Add reading formulas -## 2013-11-09 Add reading strings & formulas +## 2013-11-09 (PRN) Add reading strings & formulas ## '' Rearrange code ## '' Prepare for fast reading (still uncommented) ## '' Implement selected range (still rough ATM but for devs the easiest) -## 2013-11-10 Fix typo preventing reading named worksheets instead of indices -## 2013-11-13 Pretty text output -## 2013-11-15 Catch empty sharedString.xml (means no fixed strings) -## 2013-11-16 Replace fgetl calls by fread to cope with EOLs +## 2013-11-10 (PRN) Fix typo preventing reading named worksheets instead of indices +## 2013-11-13 (PRN) Pretty text output +## 2013-11-15 (PRN) Catch empty sharedString.xml (means no fixed strings) +## 2013-11-16 (PRN) Replace fgetl calls by fread to cope with EOLs +## 2013-12-12 (PRN) Adapted regular exprssions for formulas_as_text to cope with POI +## '' Adapted regular expression for shared strings to cope with empty strings +## 2013-12-14 (PRN) Adapt regexpr for values (include "n" as value type) +## '' Fix regexpr for strings ( <v>\d? => <v>\d+ ) +## '' Add isfinite() check before attempt to process fixed strings +## 2013-12-19 (MB) Replace call to __col_str_to_number with __OCT_cc__ function [ raw, xls, rstatus ] = __OCT_xlsx2oct__ (xls, wsh, crange='', spsh_opts) @@ -119,12 +126,12 @@ ## General note for tuning: '"([^"]*)"' (w/o single quotes) could be faster than '"(.*?)"' ## (http://stackoverflow.com/questions/2503413/regular-expression-to-stop-at-first-match comment #7) - ## Beloware loads of nested IFs. They're needed to catch empty previous results, even empty sheets + ## Below are loads of nested IFs. They're needed to catch empty previous results, even empty sheets ## 1. Get pure numbers, including booleans, double and boolean formula results, from cells w/o 's=""' tag - val = cell2mat (regexp (rawdata, '<c r="\w+"(?: t="[b]+")?>(?:<f.+?(?:</f>|/>))?<v>(.*?)</v>', "tokens")); + val = cell2mat (regexp (rawdata, '<c r="\w+"(?: t="[bn]+")?>(?:<f.+?(?:</f>|/>))?<v>(.*?)</v>', "tokens")); if (! isempty (val)) - valraw = cell2mat (regexp (rawdata, '<c r="(\w+)"(?: t="[b]+")?>(?:<f.+?(?:</f>|/>))?<v>.*?</v>', "tokens")); + valraw = cell2mat (regexp (rawdata, '<c r="(\w+)"(?: t="[bn]+")?>(?:<f.+?(?:</f>|/>))?<v>.*?</v>', "tokens")); endif ## If val is still empty, try another regexpression (PRN: will this ever work? haven't seen such cells) @@ -154,12 +161,13 @@ endif ## Turn strings into numbers val = num2cell (str2double (val)); - + ## 2. String / text formulas (cached results are in this sheet; fixed strings in <sharedStrings.xml>) ## Formulas - if (spsh_opts.formulas_as_text) + if (spsh_opts.formulas_as_text) + ## Get formulas themselves as text strings. Formulas have no 't="s"' attribute. Keep starting '>' for next line - valf1 = cell2mat (regexp (rawdata, '<c r="\w+" s="\d"(?: t="(?:[^s]?|str)")?><f(?: .*?)*(>.+?)</f><v>.*?</v>', "tokens")); + valf1 = cell2mat (regexp (rawdata, '<c r="\w+"(?: s="\d+")?(?: t="\w+")?><f(?: .*?)*(>.*?)</f>(?:<v>.*?</v>)?', "tokens")); if (! isempty (valf1)) valf1 = regexprep (valf1, '^>', '='); ## Pretty text output @@ -167,7 +175,7 @@ valf1 = strrep (valf1, "<", "<"); valf1 = strrep (valf1, ">", ">"); valf1 = strrep (valf1, "&", "&"); - valrawf1 = cell2mat(regexp (rawdata, '<c r="(\w+)" s="\d"(?: t="(?:[^s]?|str)")?>(?:<f(?: .*?)*>.+?</f>)<v>.*?</v>', "tokens")); + valrawf1 = cell2mat(regexp (rawdata, '<c r="(\w+)"(?: s="\d+")?(?: t="\w+")?><f(?: .*?)*>.*?</f>(?:<v>.*?</v>)?', "tokens")); if (isempty (val)) val = valf1; else @@ -187,7 +195,7 @@ valf2 = strrep (valf2, "<", "<"); valf2 = strrep (valf2, ">", ">"); valf2 = strrep (valf2, "&", "&"); - valrawf2 = cell2mat(regexp (rawdata, '<c r="(\w+)" s="\d" t="(?:[^sb]?|str)">(?:<f.+?(?:</f>|/>))<v>.*?</v>', "tokens")) + valrawf2 = cell2mat(regexp (rawdata, '<c r="(\w+)" s="\d" t="(?:[^sb]?|str)">(?:<f.+?(?:</f>|/>))<v>.*?</v>', "tokens")); if (isempty (val)) val = valf2; valraw = valrawf2; @@ -210,15 +218,16 @@ endif endif clear valf3 valrawf3 ; - endif + endif + ## 3. Strings if (! isempty (strings)) ## Extract string values. May be much more than present in current sheet - ctext = cell2mat (regexp (strings, '<si><t>(.+?)</t></si>', "tokens")); + ctext = cell2mat (regexp (strings, '<si><t(?:>(.+?)</t>|(.*)/>)</si>', "tokens")); ## Pointers into sharedStrings.xml. "Hard" (fixed) strings have 't="s"' attribute ## For reasons known only to M$ those pointers are zero-based, so: - vals = str2double (cell2mat (regexp (rawdata, '<c r="\w+"(?: s="\d")? t="s"><v>(\d?)</v>', "tokens"))) + 1; - if (! isempty (vals)) + vals = str2double (cell2mat (regexp (rawdata, '<c r="\w+"(?: s="\d")? t="s"><v>(\d+)</v>', "tokens"))) + 1; + if (! isempty (vals) && isfinite (vals)) ## Get actual values vals = ctext(vals); ## Pretty text output @@ -227,7 +236,7 @@ vals = strrep (vals, ">", ">"); vals = strrep (vals, "&", "&"); ## Cell addresses - valraws = cell2mat (regexp (rawdata, '<c r="(\w+)"(?: s="\d")? t="s"><v>\d?</v>', "tokens")); + valraws = cell2mat (regexp (rawdata, '<c r="(\w+)"(?: s="\d")? t="s"><v>\d+</v>', "tokens")); if (isempty (val)) val = vals; valraw = valraws; @@ -238,7 +247,7 @@ endif clear vals valraws ; endif - + ## If val is empty, sheet is empty if (isempty (val)) xls.limits = []; @@ -261,7 +270,7 @@ if (0 < numel (idx.all)) idx.num = str2double (cell2mat (regexp (idx.all, '(\d+|\d+\d+|\d+\d+\d+|\d+\d+\d+\d+|\d+\d+\d+\d+\+d|\d+\d+\d+\d+\d+\d+)?', "match"))')'; idx.alph = cell2mat (regexp (idx.all, '([A-Za-z]+|[A-Za-z]+[A-Za-z]+|[A-Za-z]+[A-Za-z]+[A-Za-z]+)?', "match")); - idx.alph = double (cell2mat (cellfun (@__col_str_to_number, vi.alph, "UniformOutput", 0))); + idx.alph = double (cell2mat (cellfun (@__OCT_cc__, vi.alph, "UniformOutput", 0))); else ## To prevent warnings or errors while calculating corresponding NaN matrix idx.num = []; @@ -269,7 +278,7 @@ end ## Transform column character to column number ## A -> 1; C -> 3, AB -> 28 ... - vi.col = double (cell2mat (cellfun (@__col_str_to_number, vi.alph, "UniformOutput", 0))); + vi.col = double (cell2mat (cellfun (@__OCT_cc__, vi.alph, "UniformOutput", 0))); ## Find data rectangle limits idx.mincol = min ([idx.alph vi.col]);