comparison libinterp/corefcn/oct-stream.cc @ 30833:cfb708de1fc9

textscan: Refresh buffer with overlay (bug #62152). * libinterp/corefcn/oct-stream.cc (delimited_stream): Retain overlap with previous buffer content when refreshing to allow "putback" after buffer refresh. (delimited_stream::remaining): Add function that returns the number of chars until end of stream if it is already buffered. (textscan): Use functions that refresh the buffer when peeking into delimited_stream. Avoid buffer refresh if end of stream is already buffered.
author Markus Mützel <markus.muetzel@gmx.de>
date Sun, 13 Mar 2022 11:30:55 +0100
parents 0826c503f294
children 79edd49a5a97
comparison
equal deleted inserted replaced
30832:3e395f6fc03a 30833:cfb708de1fc9
32 #include <cstring> 32 #include <cstring>
33 33
34 #include <algorithm> 34 #include <algorithm>
35 #include <deque> 35 #include <deque>
36 #include <fstream> 36 #include <fstream>
37 #include <limits>
37 #include <iomanip> 38 #include <iomanip>
38 #include <iostream> 39 #include <iostream>
39 #include <sstream> 40 #include <sstream>
40 #include <string> 41 #include <string>
41 42
1243 refresh_buf (); 1244 refresh_buf ();
1244 } 1245 }
1245 1246
1246 // Load new data into buffer, and set eob, last, idx. 1247 // Load new data into buffer, and set eob, last, idx.
1247 // Return EOF at end of file, 0 otherwise. 1248 // Return EOF at end of file, 0 otherwise.
1248 int refresh_buf (void); 1249 int refresh_buf (bool initialize = false);
1249 1250
1250 // Get a character, relying on caller to call field_done if 1251 // Get a character, relying on caller to call field_done if
1251 // a delimiter has been reached. 1252 // a delimiter has been reached.
1252 int get (void) 1253 int get (void)
1253 { 1254 {
1288 1289
1289 void seekg (char *old_idx) { m_idx = old_idx; } 1290 void seekg (char *old_idx) { m_idx = old_idx; }
1290 1291
1291 bool eof (void) 1292 bool eof (void)
1292 { 1293 {
1293 return (m_eob == m_buf && m_i_stream.eof ()) 1294 return (m_eob == m_buf + m_overlap && m_i_stream.eof ())
1294 || (m_flags & std::ios_base::eofbit); 1295 || (m_flags & std::ios_base::eofbit);
1295 } 1296 }
1296 1297
1297 operator const void* (void) 1298 operator const void* (void)
1298 { return (! eof () && ! m_flags) ? this : nullptr; } 1299 { return (! eof () && ! m_flags) ? this : nullptr; }
1314 1315
1315 void progress_benchmark (void) { m_progress_marker = m_idx; } 1316 void progress_benchmark (void) { m_progress_marker = m_idx; }
1316 1317
1317 bool no_progress (void) { return m_progress_marker == m_idx; } 1318 bool no_progress (void) { return m_progress_marker == m_idx; }
1318 1319
1320 // Number of characters remaining until end of stream if it is already
1321 // buffered. The maximum value of std::ptrdiff_t otherwise.
1322
1323 std::ptrdiff_t remaining (void)
1324 {
1325 if (m_eob < m_buf + m_bufsize)
1326 return m_eob - m_idx;
1327 else
1328 return std::numeric_limits<std::ptrdiff_t>::max ();
1329 }
1330
1319 private: 1331 private:
1320 1332
1321 // Number of characters to read from the file at once. 1333 // Number of characters to read from the file at once.
1322 int m_bufsize; 1334 int m_bufsize;
1323 1335
1334 // delimited is false). 1346 // delimited is false).
1335 char *m_last; 1347 char *m_last;
1336 1348
1337 // Position after last character in buffer. 1349 // Position after last character in buffer.
1338 char *m_eob; 1350 char *m_eob;
1351
1352 // Overlap with old content when refreshing buffer.
1353 std::ptrdiff_t m_overlap;
1339 1354
1340 // True if there is delimiter in the buffer after idx. 1355 // True if there is delimiter in the buffer after idx.
1341 bool m_delimited; 1356 bool m_delimited;
1342 1357
1343 // Longest lookahead required. 1358 // Longest lookahead required.
1369 { 1384 {
1370 m_buf = new char[m_bufsize]; 1385 m_buf = new char[m_bufsize];
1371 m_eob = m_buf + m_bufsize; 1386 m_eob = m_buf + m_bufsize;
1372 m_idx = m_eob; // refresh_buf shouldn't try to copy old data 1387 m_idx = m_eob; // refresh_buf shouldn't try to copy old data
1373 m_progress_marker = m_idx; 1388 m_progress_marker = m_idx;
1374 refresh_buf (); // load the first batch of data 1389 refresh_buf (true); // load the first batch of data
1375 } 1390 }
1376 1391
1377 // Used to create a stream from a strstream from data read from a dstr. 1392 // Used to create a stream from a strstream from data read from a dstr.
1378 delimited_stream::delimited_stream (std::istream& is, 1393 delimited_stream::delimited_stream (std::istream& is,
1379 const delimited_stream& ds) 1394 const delimited_stream& ds)
1385 // Seek to the correct position in i_stream. 1400 // Seek to the correct position in i_stream.
1386 if (! eof ()) 1401 if (! eof ())
1387 { 1402 {
1388 m_i_stream.clear (); 1403 m_i_stream.clear ();
1389 m_i_stream.seekg (m_buf_in_file); 1404 m_i_stream.seekg (m_buf_in_file);
1390 m_i_stream.read (m_buf, m_idx - m_buf); 1405 m_i_stream.read (m_buf, m_idx - m_buf - m_overlap);
1391 } 1406 }
1392 1407
1393 delete [] m_buf; 1408 delete [] m_buf;
1394 } 1409 }
1395 1410
1443 // new data to fill it. Return EOF if the file is at EOF before 1458 // new data to fill it. Return EOF if the file is at EOF before
1444 // reading any data and all of the data that has been read has been 1459 // reading any data and all of the data that has been read has been
1445 // processed. 1460 // processed.
1446 1461
1447 int 1462 int
1448 delimited_stream::refresh_buf (void) 1463 delimited_stream::refresh_buf (bool initialize)
1449 { 1464 {
1450 if (eof ()) 1465 if (eof ())
1451 return std::istream::traits_type::eof (); 1466 return std::istream::traits_type::eof ();
1452 1467
1453 int retval; 1468 int retval;
1454 1469
1455 if (m_eob < m_idx) 1470 if (m_eob < m_idx)
1456 m_idx = m_eob; 1471 m_idx = m_eob;
1457 1472
1458 std::size_t old_remaining = m_eob - m_idx; 1473 std::size_t old_remaining = m_eob - m_idx;
1474 std::size_t old_overlap = 0;
1475
1476 if (initialize || (m_idx - m_buf <= 0))
1477 m_overlap = 0;
1478 else
1479 {
1480 old_overlap = m_overlap;
1481 // Retain the last 25 bytes in the buffer. That should be more than enough
1482 // to putback an entire double precision floating point number in decimal
1483 // including 3 digit exponent and signs. Do we ever need to putback more
1484 // than that?
1485 m_overlap = 25;
1486 // Ensure we don't "underflow" with the overlap
1487 m_overlap = std::min (m_overlap, m_idx - m_buf - 1);
1488 }
1459 1489
1460 octave_quit (); // allow ctrl-C 1490 octave_quit (); // allow ctrl-C
1461 1491
1462 if (old_remaining > 0) 1492 if (old_remaining + m_overlap > 0)
1463 { 1493 {
1464 m_buf_in_file += (m_idx - m_buf); 1494 m_buf_in_file += (m_idx - old_overlap - m_buf);
1465 memmove (m_buf, m_idx, old_remaining); 1495 std::memmove (m_buf, m_idx - m_overlap, m_overlap + old_remaining);
1466 } 1496 }
1467 else 1497 else
1468 m_buf_in_file = m_i_stream.tellg (); // record for destructor 1498 m_buf_in_file = m_i_stream.tellg (); // record for destructor
1469 1499
1470 m_progress_marker -= m_idx - m_buf; // where original idx would have been 1500 // where original idx would have been
1471 m_idx = m_buf; 1501 m_progress_marker -= m_idx - m_overlap - m_buf;
1502 m_idx = m_buf + m_overlap;
1472 1503
1473 int gcount; // chars read 1504 int gcount; // chars read
1474 if (! m_i_stream.eof ()) 1505 if (! m_i_stream.eof ())
1475 { 1506 {
1476 m_i_stream.read (m_buf + old_remaining, m_bufsize - old_remaining); 1507 m_i_stream.read (m_buf + m_overlap + old_remaining,
1508 m_bufsize - m_overlap - old_remaining);
1477 gcount = m_i_stream.gcount (); 1509 gcount = m_i_stream.gcount ();
1478 } 1510 }
1479 else 1511 else
1480 gcount = 0; 1512 gcount = 0;
1481 1513
1482 m_eob = m_buf + old_remaining + gcount; 1514 m_eob = m_buf + m_overlap + old_remaining + gcount;
1483 m_last = m_eob; 1515 m_last = m_eob;
1484 if (gcount == 0) 1516 if (gcount == 0)
1485 { 1517 {
1486 m_delimited = false; 1518 m_delimited = false;
1487 1519
1488 if (m_eob != m_buf) // no more data in file, but still some to go 1520 if (m_eob != m_buf + m_overlap)
1521 // no more data in file, but still some to go
1489 retval = 0; 1522 retval = 0;
1490 else 1523 else
1491 // file and buffer are both done. 1524 // file and buffer are both done.
1492 retval = std::istream::traits_type::eof (); 1525 retval = std::istream::traits_type::eof ();
1493 } 1526 }
1494 else 1527 else
1495 { 1528 {
1496 m_delimited = true; 1529 m_delimited = true;
1497 1530
1498 for (m_last = m_eob - m_longest; m_last - m_buf >= 0; m_last--) 1531 for (m_last = m_eob - m_longest; m_last - m_buf - m_overlap >= 0;
1532 m_last--)
1499 { 1533 {
1500 if (m_delims.find (*m_last) != std::string::npos) 1534 if (m_delims.find (*m_last) != std::string::npos)
1501 break; 1535 break;
1502 } 1536 }
1503 1537
1504 if (m_last < m_buf) 1538 if (m_last < m_buf + m_overlap)
1505 m_delimited = false; 1539 m_delimited = false;
1506 1540
1507 retval = 0; 1541 retval = 0;
1508 } 1542 }
1509 1543
1523 char * 1557 char *
1524 delimited_stream::read (char *buffer, int size, char* &prior_tell) 1558 delimited_stream::read (char *buffer, int size, char* &prior_tell)
1525 { 1559 {
1526 char *retval; 1560 char *retval;
1527 1561
1528 if (m_eob - m_idx > size) 1562 if (m_eob - m_idx >= size)
1529 { 1563 {
1530 retval = m_idx; 1564 retval = m_idx;
1531 m_idx += size; 1565 m_idx += size;
1532 if (m_idx > m_last) 1566 if (m_idx > m_last)
1533 m_delimited = false; 1567 m_delimited = false;
2612 buf_size = std::min (buf_size, std::max (ntimes, 80 * ntimes)); 2646 buf_size = std::min (buf_size, std::max (ntimes, 80 * ntimes));
2613 buf_size = std::max (buf_size, ntimes); 2647 buf_size = std::max (buf_size, ntimes);
2614 } 2648 }
2615 // Finally, create the stream. 2649 // Finally, create the stream.
2616 delimited_stream is (isp, 2650 delimited_stream is (isp,
2617 (m_delim_table.empty () ? m_whitespace + "\r\n" 2651 (m_delims.empty () ? m_whitespace + "\r\n"
2618 : m_delims), 2652 : m_delims),
2619 max_lookahead, buf_size); 2653 max_lookahead, buf_size);
2620 2654
2621 // Grow retval dynamically. "size" is half the initial size 2655 // Grow retval dynamically. "size" is half the initial size
2622 // (FIXME: Should we start smaller if ntimes is large?) 2656 // (FIXME: Should we start smaller if ntimes is large?)
2623 octave_idx_type size = ((ntimes < 8 && ntimes >= 0) ? ntimes : 1); 2657 octave_idx_type size = ((ntimes < 8 && ntimes >= 0) ? ntimes : 1);
2788 int sign = 1; 2822 int sign = 1;
2789 unsigned int width_left = fmt.width; 2823 unsigned int width_left = fmt.width;
2790 double retval = 0; 2824 double retval = 0;
2791 bool valid = false; // syntactically correct double? 2825 bool valid = false; // syntactically correct double?
2792 2826
2793 int ch = is.peek (); 2827 int ch = is.peek_undelim ();
2794 2828
2795 if (ch == '+') 2829 if (ch == '+')
2796 { 2830 {
2797 is.get (); 2831 is.get ();
2798 ch = is.peek (); 2832 ch = is.peek_undelim ();
2799 if (width_left) 2833 if (width_left)
2800 width_left--; 2834 width_left--;
2801 } 2835 }
2802 else if (ch == '-') 2836 else if (ch == '-')
2803 { 2837 {
2804 sign = -1; 2838 sign = -1;
2805 is.get (); 2839 is.get ();
2806 ch = is.peek (); 2840 ch = is.peek_undelim ();
2807 if (width_left) 2841 if (width_left)
2808 width_left--; 2842 width_left--;
2809 } 2843 }
2810 2844
2811 // Read integer part 2845 // Read integer part
2867 2901
2868 // look for exponent part in, e.g., 6.023E+23 2902 // look for exponent part in, e.g., 6.023E+23
2869 bool used_exp = false; 2903 bool used_exp = false;
2870 if (valid && width_left > 1 && m_exp_chars.find (ch) != std::string::npos) 2904 if (valid && width_left > 1 && m_exp_chars.find (ch) != std::string::npos)
2871 { 2905 {
2872 int ch1 = is.peek (); 2906 int ch1 = is.peek_undelim ();
2873 if (ch1 == '-' || ch1 == '+' || (ch1 >= '0' && ch1 <= '9')) 2907 if (ch1 == '-' || ch1 == '+' || (ch1 >= '0' && ch1 <= '9'))
2874 { 2908 {
2875 // if 1.0e+$ or some such, this will set failbit, as we want 2909 // if 1.0e+$ or some such, this will set failbit, as we want
2876 width_left--; // count "E" 2910 width_left--; // count "E"
2877 int exp = 0; 2911 int exp = 0;
2886 width_left--; 2920 width_left--;
2887 exp_sign = -1; 2921 exp_sign = -1;
2888 is.get (); 2922 is.get ();
2889 } 2923 }
2890 valid = false; 2924 valid = false;
2891 while (width_left-- && is && (ch = is.get ()) >= '0' && ch <= '9') 2925 while (width_left-- && is && (ch = is.get_undelim ()) >= '0' && ch <= '9')
2892 { 2926 {
2893 exp = exp*10 + ch - '0'; 2927 exp = exp*10 + ch - '0';
2894 valid = true; 2928 valid = true;
2895 } 2929 }
2896 width_left++; 2930 width_left++;
2909 is.clear (); 2943 is.clear ();
2910 if (! used_exp && ch != std::istream::traits_type::eof () && width_left) 2944 if (! used_exp && ch != std::istream::traits_type::eof () && width_left)
2911 is.putback (ch); 2945 is.putback (ch);
2912 2946
2913 // Check for +/- inf and NaN 2947 // Check for +/- inf and NaN
2914 if (! valid && width_left >= 3) 2948 if (! valid && width_left >= 3 && is.remaining () >= 3)
2915 { 2949 {
2916 int i = lookahead (is, m_inf_nan, 3, false); // false->case insensitive 2950 int i = lookahead (is, m_inf_nan, 3, false); // false->case insensitive
2917 if (i == 0) 2951 if (i == 0)
2918 { 2952 {
2919 retval = numeric_limits<double>::Inf (); 2953 retval = numeric_limits<double>::Inf ();
2945 double im = 0; 2979 double im = 0;
2946 double re = 0; 2980 double re = 0;
2947 bool as_empty = false; // did we fail but match a "treat_as_empty" string? 2981 bool as_empty = false; // did we fail but match a "treat_as_empty" string?
2948 bool inf = false; 2982 bool inf = false;
2949 2983
2950 int ch = is.peek (); 2984 int ch = is.peek_undelim ();
2951 if (ch == '+' || ch == '-') // check for [+-][ij] with no coefficients 2985 if (ch == '+' || ch == '-') // check for [+-][ij] with no coefficients
2952 { 2986 {
2953 ch = is.get (); 2987 ch = is.get ();
2954 int ch2 = is.peek (); 2988 int ch2 = is.peek_undelim ();
2955 if (ch2 == 'i' || ch2 == 'j') 2989 if (ch2 == 'i' || ch2 == 'j')
2956 { 2990 {
2957 double value = 1; 2991 double value = 1;
2958 is.get (); 2992 is.get ();
2959 // Check not -inf 2993 // Check not -inf
2960 if (is.peek () == 'n') 2994 if (is.peek_undelim () == 'n')
2961 { 2995 {
2962 char *pos = is.tellg (); 2996 char *pos = is.tellg ();
2963 std::ios::iostate state = is.rdstate (); 2997 std::ios::iostate state = is.rdstate ();
2964 2998
2965 is.get (); 2999 is.get ();
2966 ch2 = is.get (); 3000 ch2 = is.get_undelim ();
2967 if (ch2 == 'f') 3001 if (ch2 == 'f')
2968 { 3002 {
2969 inf = true; 3003 inf = true;
2970 re = (ch == '+' ? numeric_limits<double>::Inf () 3004 re = (ch == '+' ? numeric_limits<double>::Inf ()
2971 : -numeric_limits<double>::Inf ()); 3005 : -numeric_limits<double>::Inf ());
2972 value = 0; 3006 value = 0;
2973 } 3007 }
2974 else 3008 else
2975 { 3009 {
2976 is.clear (state); 3010 is.clear (state);
3011 // FIXME: Buffer might have refreshed.
3012 // pos might no longer be valid.
2977 is.seekg (pos); // reset to position before look-ahead 3013 is.seekg (pos); // reset to position before look-ahead
2978 } 3014 }
2979 } 3015 }
2980 3016
2981 im = (ch == '+') ? value : -value; 3017 im = (ch == '+') ? value : -value;
2987 if (! im && ! inf) // if not [+-][ij] or [+-]inf, read real normally 3023 if (! im && ! inf) // if not [+-][ij] or [+-]inf, read real normally
2988 { 3024 {
2989 char *pos = is.tellg (); 3025 char *pos = is.tellg ();
2990 std::ios::iostate state = is.rdstate (); 3026 std::ios::iostate state = is.rdstate ();
2991 //re = read_value<double> (is); 3027 //re = read_value<double> (is);
3028 // FIXME: read_double might refresh the buffer. So seekg might be off.
2992 re = read_double (is, fmt); 3029 re = read_double (is, fmt);
2993 3030
2994 // check for "treat as empty" string 3031 // check for "treat as empty" string
2995 if (m_treat_as_empty.numel () 3032 if (m_treat_as_empty.numel ()
2996 && (is.fail () || math::is_NaN_or_NA (Complex (re)) 3033 && (is.fail () || math::is_NaN_or_NA (Complex (re))
3038 3075
3039 if (! is.eof () && ! as_empty) 3076 if (! is.eof () && ! as_empty)
3040 { 3077 {
3041 state = is.rdstate (); // before tellg, since that fails at EOF 3078 state = is.rdstate (); // before tellg, since that fails at EOF
3042 3079
3043 ch = is.peek (); // ch == EOF if read failed; no need to chk fail 3080 ch = is.peek_undelim ();
3081 // ch == EOF if read failed; no need to chk fail
3044 if (ch == 'i' || ch == 'j') // pure imaginary 3082 if (ch == 'i' || ch == 'j') // pure imaginary
3045 { 3083 {
3046 is.get (); 3084 is.get ();
3047 im = re; 3085 im = re;
3048 re = 0; 3086 re = 0;
3052 // save stream state in case we have to restore it 3090 // save stream state in case we have to restore it
3053 pos = is.tellg (); 3091 pos = is.tellg ();
3054 state = is.rdstate (); 3092 state = is.rdstate ();
3055 3093
3056 //im = read_value<double> (is); 3094 //im = read_value<double> (is);
3095 // FIXME: read_double might refresh the buffer.
3096 // So seekg might be off after this.
3057 im = read_double (is, fmt); 3097 im = read_double (is, fmt);
3058 if (is.fail ()) 3098 if (is.fail ())
3059 im = 1; 3099 im = 1;
3060 3100
3061 if (is.peek () == 'i' || is.peek () == 'j') 3101 if (is.peek_undelim () == 'i' || is.peek_undelim () == 'j')
3062 is.get (); 3102 is.get ();
3063 else 3103 else
3064 { 3104 {
3065 im = 0; // no valid imaginary part. Restore state 3105 im = 0; // no valid imaginary part. Restore state
3066 is.clear (state); // eof shouldn't cause fail. 3106 is.clear (state); // eof shouldn't cause fail.
3167 // Grow string in an exponential fashion if necessary. 3207 // Grow string in an exponential fashion if necessary.
3168 if (i >= val.length ()) 3208 if (i >= val.length ())
3169 val.append (std::max (val.length (), 3209 val.append (std::max (val.length (),
3170 static_cast<std::size_t> (16)), '\0'); 3210 static_cast<std::size_t> (16)), '\0');
3171 3211
3172 int ch = is.get (); 3212 int ch = is.get_undelim ();
3173 if (is_delim (ch) || ch == std::istream::traits_type::eof ()) 3213 if (is_delim (ch) || ch == std::istream::traits_type::eof ())
3174 { 3214 {
3175 is.putback (ch); 3215 is.putback (ch);
3176 break; 3216 break;
3177 } 3217 }
3225 textscan::scan_qstring (delimited_stream& is, const textscan_format_elt& fmt, 3265 textscan::scan_qstring (delimited_stream& is, const textscan_format_elt& fmt,
3226 std::string& val) 3266 std::string& val)
3227 { 3267 {
3228 skip_whitespace (is); 3268 skip_whitespace (is);
3229 3269
3230 if (is.peek () != '"') 3270 if (is.peek_undelim () != '"')
3231 scan_string (is, fmt, val); 3271 scan_string (is, fmt, val);
3232 else 3272 else
3233 { 3273 {
3234 is.get (); 3274 is.get ();
3235 scan_caret (is, R"(")", val); // read everything until " 3275 scan_caret (is, R"(")", val); // read everything until "
3506 3546
3507 if (! is.eof ()) 3547 if (! is.eof ())
3508 { 3548 {
3509 if (m_delim_list.isempty ()) 3549 if (m_delim_list.isempty ())
3510 { 3550 {
3511 if (! is_delim (is.peek ())) 3551 if (! is_delim (is.peek_undelim ()))
3512 this_conversion_failed = true; 3552 this_conversion_failed = true;
3513 } 3553 }
3514 else // Cell array of multi-character delimiters 3554 else // Cell array of multi-character delimiters
3515 { 3555 {
3516 char *pos = is.tellg (); 3556 char *pos = is.tellg ();
3930 { 3970 {
3931 if (is_delim (c1) || c1 == m_eol1 || c1 == m_eol2) 3971 if (is_delim (c1) || c1 == m_eol1 || c1 == m_eol2)
3932 { 3972 {
3933 is.get (); 3973 is.get ();
3934 if (c1 == m_eol1 && is.peek_undelim () == m_eol2) 3974 if (c1 == m_eol1 && is.peek_undelim () == m_eol2)
3935 is.get_undelim (); // if \r\n, skip the \n too. 3975 is.get (); // if \r\n, skip the \n too.
3936 3976
3937 if (multiple_delims_as_one) 3977 if (multiple_delims_as_one)
3938 { 3978 {
3939 int prev = -1; 3979 int prev = -1;
3940 // skip multiple delims. 3980 // skip multiple delims.