view scripts/strings/findstr.m @ 8920:eb63fbe60fab

update copyright notices
author John W. Eaton <jwe@octave.org>
date Sat, 07 Mar 2009 10:41:27 -0500
parents bc982528de11
children 95c3e38098bf
line wrap: on
line source

## Copyright (C) 1996, 1999, 2000, 2002, 2003, 2005, 2006, 2007, 2008, 2009
##               Kurt Hornik
##
## This file is part of Octave.
##
## Octave is free software; you can redistribute it and/or modify it
## under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 3 of the License, or (at
## your option) any later version.
##
## Octave is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with Octave; see the file COPYING.  If not, see
## <http://www.gnu.org/licenses/>.

## -*- texinfo -*-
## @deftypefn {Function File} {} findstr (@var{s}, @var{t}, @var{overlap})
## Return the vector of all positions in the longer of the two strings
## @var{s} and @var{t} where an occurrence of the shorter of the two starts.
## If the optional argument @var{overlap} is nonzero, the returned vector
## can include overlapping positions (this is the default).  For example,
##
## @example
## @group
## findstr ("ababab", "a")
##      @result{} [1, 3, 5]
## findstr ("abababa", "aba", 0)
##      @result{} [1, 5]
## @end group
## @end example
## @seealso{strfind, strmatch, strcmp, strncmp, strcmpi, strncmpi, find}
## @end deftypefn

## Note that this implementation swaps the strings if second one is longer
## than the first, so try to put the longer one first.
##
## Author: Kurt Hornik <Kurt.Hornik@wu-wien.ac.at>
## Adapted-By: jwe

function v = findstr (s, t, overlap)

  if (nargin < 2 || nargin > 3)
    print_usage ();
  endif

  if (all (size (s) > 1) || all (size (t) > 1))
    error ("findstr: arguments must have only one non-singleton dimension");
  endif

  if (nargin == 2)
    overlap = 1;
  endif

  ## Make S be the longer string.
  if (length (s) < length (t))
    tmp = s;
    s = t;
    t = tmp;
  endif
  
  l_s = length (s);
  l_t = length (t);
  
  if (l_t == 0)
    ## zero length target: return empty set
    v = [];
    
  elseif (l_t == 1)
    ## length one target: simple find
    v = find (s == t);
    
  elseif (l_t == 2)
    ## length two target: find first at i and second at i+1
    v = find (s(1:l_s-1) == t(1) & s(2:l_s) == t(2));
    
  else
    ## length three or more: match the first three by find then go through
    ## the much smaller list to determine which of them are real matches
    limit = l_s - l_t + 1;
    v = find (s(1:limit) == t(1)
	      & s(2:limit+1) == t(2)
	      & s (3:limit+2) == t(3));
  endif

  ## Need to search the index vector if our find was too short
  ## (target length > 3), or if we don't allow overlaps.  Note though
  ## that there cannot be any overlaps if the first character in the
  ## target is different from the remaining characters in the target,
  ## so a single character, two different characters, or first character
  ## different from the second two don't need to be searched.
  if (l_t >= 3 || (! overlap && l_t > 1 && any (t(1) == t(2:l_t))))
    ## force strings to be both row vectors or both column vectors
    if (all (size (s) != size (t)))
      t = t.';
    endif
    
    ## determine which ones to keep
    keep = zeros (size (v));
    ind = 0:l_t-1;
    if (overlap)
      for idx = 1:length (v)
	keep(idx) = all (s(v(idx) + ind) == t);
      endfor
    else
      ## First possible position for next non-overlapping match.
      next = 1;
      for idx = 1:length (v)
	if (v(idx) >= next && s(v(idx) + ind) == t)
	  keep(idx) = 1;
	  ## Skip to the next possible match position.
	  next = v(idx) + l_t;
	else
	  keep(idx) = 0;
	endif
      endfor
    endif
    if (! isempty (v))
      v = v(find (keep));
    endif
  endif

  if (isempty (v))
    v = [];
  endif

  ## Always return a column vector, because that's what the old one did.
  if (rows (v) > 1) 
    v = v.';
  endif

endfunction

%!assert ((findstr ("abababa", "a") == [1, 3, 5, 7]
%! && findstr ("abababa", "aba") == [1, 3, 5]
%! && findstr ("abababa", "aba", 0) == [1, 5]));

%!error findstr ();

%!error findstr ("foo", "bar", 3, 4);