# HG changeset patch # User Markus Mützel # Date 1571337663 -7200 # Node ID 7dc31256c5e48e45ee34a67dc4866327e19419f9 # Parent d389416f0e50fe2a8c424765382886e30ad5001a Document that regexp* functions need UTF-8 encoded input (bug #35910). * regexp.cc (Fregexp, Fregexpi, Fregexprep): Document that the input strings must be UTF-8 encoded. * NEWS: Announce support for UTF-8 encoded strings in regexp* functions. diff -r d389416f0e50 -r 7dc31256c5e4 NEWS --- a/NEWS Mon Oct 21 11:50:20 2019 -0400 +++ b/NEWS Thu Oct 17 20:41:03 2019 +0200 @@ -40,6 +40,12 @@ Octave:colon-complex-argument : when any arg is complex Octave:colon-nonscalar-argument : when any arg is non-scalar +- The `regexp` and related functions now correctly handle and *require* + strings in UTF-8 encoding. As with any other function that requires + strings to be encoded in Octave's native encoding, you can use + "native2unicode" to convert from your preferred locale. For example, + the copyright symbol in UTF-8 is `native2unicode (169, "latin1")`. + #### Graphics backend - Graphic primitives now accept a color property value of `"none"` diff -r d389416f0e50 -r 7dc31256c5e4 libinterp/corefcn/regexp.cc --- a/libinterp/corefcn/regexp.cc Mon Oct 21 11:50:20 2019 -0400 +++ b/libinterp/corefcn/regexp.cc Thu Oct 17 20:41:03 2019 +0200 @@ -662,8 +662,8 @@ @deftypefnx {} {[@dots{}] =} regexp (@var{str}, @var{pat}, "@var{opt1}", @dots{}) Regular expression string matching. -Search for @var{pat} in @var{str} and return the positions and substrings of -any matches, or empty values if there are none. +Search for @var{pat} in UTF-8 encoded @var{str} and return the positions and +substrings of any matches, or empty values if there are none. The matched pattern @var{pat} can include any of the standard regex operators, including: @@ -1195,9 +1195,9 @@ Case insensitive regular expression string matching. 
-Search for @var{pat} in @var{str} and return the positions and substrings of -any matches, or empty values if there are none. @xref{XREFregexp,,regexp}, -for details on the syntax of the search pattern. +Search for @var{pat} in UTF-8 encoded @var{str} and return the positions and +substrings of any matches, or empty values if there are none. +@xref{XREFregexp,,regexp}, for details on the syntax of the search pattern. @seealso{regexp} @end deftypefn */) { @@ -1396,6 +1396,8 @@ The pattern is a regular expression as documented for @code{regexp}. @xref{XREFregexp,,regexp}. +All strings must be UTF-8 encoded. + The replacement string may contain @code{$i}, which substitutes for the ith set of parentheses in the match string. For example,