171
|
1 /* search.c -- How to search large bodies of text. */ |
|
2 |
|
3 /* This file is part of GNU Info, a program for reading online documentation |
|
4 stored in Info format. |
|
5 |
|
6 Copyright (C) 1993 Free Software Foundation, Inc. |
|
7 |
|
8 This program is free software; you can redistribute it and/or modify |
|
9 it under the terms of the GNU General Public License as published by |
|
10 the Free Software Foundation; either version 2, or (at your option) |
|
11 any later version. |
|
12 |
|
13 This program is distributed in the hope that it will be useful, |
|
14 but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
16 GNU General Public License for more details. |
|
17 |
|
18 You should have received a copy of the GNU General Public License |
|
19 along with this program; if not, write to the Free Software |
|
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
|
21 |
|
22 Written by Brian Fox (bfox@ai.mit.edu). */ |
|
23 |
1242
|
24 #ifdef HAVE_CONFIG_H |
|
25 #include <config.h> |
|
26 #endif |
|
27 |
171
|
28 #include <ctype.h> |
|
29 #include <sys/types.h> |
|
30 #include <sys/stat.h> |
|
31 #include "general.h" |
|
32 #include "search.h" |
|
33 #include "nodes.h" |
|
34 |
|
35 #if !defined (NULL) |
|
36 # define NULL 0x0 |
|
37 #endif /* !NULL */ |
|
38 |
|
39 /* The search functions take two arguments: |
|
40 |
|
41 1) a string to search for, and |
|
42 |
|
43 2) a pointer to a SEARCH_BINDING which contains the buffer, start, |
|
44 and end of the search. |
|
45 |
|
46 They return a long, which is the offset from the start of the buffer |
|
47 at which the match was found. An offset of -1 indicates failure. */ |
|
48 |
|
49 /* A function which makes a binding with buffer and bounds. */ |
|
50 SEARCH_BINDING * |
|
51 make_binding (buffer, start, end) |
|
52 char *buffer; |
|
53 long start, end; |
|
54 { |
|
55 SEARCH_BINDING *binding; |
|
56 |
|
57 binding = (SEARCH_BINDING *)xmalloc (sizeof (SEARCH_BINDING)); |
|
58 binding->buffer = buffer; |
|
59 binding->start = start; |
|
60 binding->end = end; |
|
61 binding->flags = 0; |
|
62 |
|
63 return (binding); |
|
64 } |
|
65 |
|
66 /* Make a copy of BINDING without duplicating the data. */ |
|
67 SEARCH_BINDING * |
|
68 copy_binding (binding) |
|
69 SEARCH_BINDING *binding; |
|
70 { |
|
71 SEARCH_BINDING *copy; |
|
72 |
|
73 copy = make_binding (binding->buffer, binding->start, binding->end); |
|
74 copy->flags = binding->flags; |
|
75 return (copy); |
|
76 } |
|
77 |
|
78 |
|
79 /* **************************************************************** */ |
|
80 /* */ |
|
81 /* The Actual Searching Functions */ |
|
82 /* */ |
|
83 /* **************************************************************** */ |
|
84 |
|
85 /* Search forwards or backwards for the text delimited by BINDING. |
|
86 The search is forwards if BINDING->start is greater than BINDING->end. */ |
|
87 long |
|
88 search (string, binding) |
|
89 char *string; |
|
90 SEARCH_BINDING *binding; |
|
91 { |
|
92 long result; |
|
93 |
|
94 /* If the search is backwards, then search backwards, otherwise forwards. */ |
|
95 if (binding->start > binding->end) |
|
96 result = search_backward (string, binding); |
|
97 else |
|
98 result = search_forward (string, binding); |
|
99 |
|
100 return (result); |
|
101 } |
|
102 |
|
103 /* Search forwards for STRING through the text delimited in BINDING. */ |
|
104 long |
|
105 search_forward (string, binding) |
|
106 char *string; |
|
107 SEARCH_BINDING *binding; |
|
108 { |
|
109 register int c, i, len; |
|
110 register char *buff, *end; |
|
111 char *alternate = (char *)NULL; |
|
112 |
|
113 len = strlen (string); |
|
114 |
|
115 /* We match characters in the search buffer against STRING and ALTERNATE. |
|
116 ALTERNATE is a case reversed version of STRING; this is cheaper than |
|
117 case folding each character before comparison. Alternate is only |
|
118 used if the case folding bit is turned on in the passed BINDING. */ |
|
119 |
|
120 if (binding->flags & S_FoldCase) |
|
121 { |
|
122 alternate = savestring (string); |
|
123 |
|
124 for (i = 0; i < len; i++) |
|
125 { |
|
126 if (islower (alternate[i])) |
|
127 alternate[i] = toupper (alternate[i]); |
|
128 else if (isupper (alternate[i])) |
|
129 alternate[i] = tolower (alternate[i]); |
|
130 } |
|
131 } |
|
132 |
|
133 buff = binding->buffer + binding->start; |
|
134 end = binding->buffer + binding->end + 1; |
|
135 |
|
136 while (buff < (end - len)) |
|
137 { |
|
138 for (i = 0; i < len; i++) |
|
139 { |
|
140 c = buff[i]; |
|
141 |
|
142 if ((c != string[i]) && (!alternate || c != alternate[i])) |
|
143 break; |
|
144 } |
|
145 |
|
146 if (!string[i]) |
|
147 { |
|
148 if (alternate) |
|
149 free (alternate); |
|
150 if (binding->flags & S_SkipDest) |
|
151 buff += len; |
|
152 return ((long) (buff - binding->buffer)); |
|
153 } |
|
154 |
|
155 buff++; |
|
156 } |
|
157 |
|
158 if (alternate) |
|
159 free (alternate); |
|
160 |
|
161 return ((long) -1); |
|
162 } |
|
163 |
|
164 /* Search for STRING backwards through the text delimited in BINDING. */ |
|
165 long |
|
166 search_backward (input_string, binding) |
|
167 char *input_string; |
|
168 SEARCH_BINDING *binding; |
|
169 { |
|
170 register int c, i, len; |
|
171 register char *buff, *end; |
|
172 char *string; |
|
173 char *alternate = (char *)NULL; |
|
174 |
|
175 len = strlen (input_string); |
|
176 |
|
177 /* Reverse the characters in the search string. */ |
|
178 string = (char *)xmalloc (1 + len); |
|
179 for (c = 0, i = len - 1; input_string[c]; c++, i--) |
|
180 string[i] = input_string[c]; |
|
181 |
|
182 string[c] = '\0'; |
|
183 |
|
184 /* We match characters in the search buffer against STRING and ALTERNATE. |
|
185 ALTERNATE is a case reversed version of STRING; this is cheaper than |
|
186 case folding each character before comparison. ALTERNATE is only |
|
187 used if the case folding bit is turned on in the passed BINDING. */ |
|
188 |
|
189 if (binding->flags & S_FoldCase) |
|
190 { |
|
191 alternate = savestring (string); |
|
192 |
|
193 for (i = 0; i < len; i++) |
|
194 { |
|
195 if (islower (alternate[i])) |
|
196 alternate[i] = toupper (alternate[i]); |
|
197 else if (isupper (alternate[i])) |
|
198 alternate[i] = tolower (alternate[i]); |
|
199 } |
|
200 } |
|
201 |
|
202 buff = binding->buffer + binding->start; |
|
203 end = binding->buffer + binding->end; |
|
204 |
|
205 while (buff > end + len) |
|
206 { |
|
207 for (i = 0; i < len; i++) |
|
208 { |
|
209 c = *(buff - i); |
|
210 |
|
211 if (c != string[i] && (alternate && c != alternate[i])) |
|
212 break; |
|
213 } |
|
214 |
|
215 if (!string[i]) |
|
216 { |
|
217 free (string); |
|
218 if (alternate) |
|
219 free (alternate); |
|
220 |
|
221 if (binding->flags & S_SkipDest) |
|
222 buff -= len; |
|
223 return ((long) (1 + (buff - binding->buffer))); |
|
224 } |
|
225 |
|
226 buff--; |
|
227 } |
|
228 |
|
229 free (string); |
|
230 if (alternate) |
|
231 free (alternate); |
|
232 |
|
233 return ((long) -1); |
|
234 } |
|
235 |
|
236 /* Find STRING in LINE, returning the offset of the end of the string. |
|
237 Return an offset of -1 if STRING does not appear in LINE. The search |
|
238 is bound by the end of the line (i.e., either NEWLINE or 0). */ |
|
239 int |
|
240 string_in_line (string, line) |
|
241 char *string, *line; |
|
242 { |
|
243 register int end; |
|
244 SEARCH_BINDING binding; |
|
245 |
|
246 /* Find the end of the line. */ |
|
247 for (end = 0; line[end] && line[end] != '\n'; end++); |
|
248 |
|
249 /* Search for STRING within these confines. */ |
|
250 binding.buffer = line; |
|
251 binding.start = 0; |
|
252 binding.end = end; |
|
253 binding.flags = S_FoldCase | S_SkipDest; |
|
254 |
|
255 return (search_forward (string, &binding)); |
|
256 } |
|
257 |
|
258 /* Return non-zero if STRING is the first text to appear at BINDING. */ |
|
259 int |
|
260 looking_at (string, binding) |
|
261 char *string; |
|
262 SEARCH_BINDING *binding; |
|
263 { |
|
264 long search_end; |
|
265 |
|
266 search_end = search (string, binding); |
|
267 |
|
268 /* If the string was not found, SEARCH_END is -1. If the string was found, |
|
269 but not right away, SEARCH_END is != binding->start. Otherwise, the |
|
270 string was found at binding->start. */ |
|
271 return (search_end == binding->start); |
|
272 } |
|
273 |
|
274 /* **************************************************************** */ |
|
275 /* */ |
|
276 /* Small String Searches */ |
|
277 /* */ |
|
278 /* **************************************************************** */ |
|
279 |
|
280 /* Function names that start with "skip" are passed a string, and return |
|
281 an offset from the start of that string. Function names that start |
|
282 with "find" are passed a SEARCH_BINDING, and return an absolute position |
|
283 marker of the item being searched for. "Find" functions return a value |
|
284 of -1 if the item being looked for couldn't be found. */ |
|
285 |
|
286 /* Return the index of the first non-whitespace character in STRING. */ |
|
287 int |
|
288 skip_whitespace (string) |
|
289 char *string; |
|
290 { |
|
291 register int i; |
|
292 |
|
293 for (i = 0; string && whitespace (string[i]); i++); |
|
294 return (i); |
|
295 } |
|
296 |
|
297 /* Return the index of the first non-whitespace or newline character in |
|
298 STRING. */ |
|
299 int |
|
300 skip_whitespace_and_newlines (string) |
|
301 char *string; |
|
302 { |
|
303 register int i; |
|
304 |
|
305 for (i = 0; string && (whitespace (string[i]) || string[i] == '\n'); i++); |
|
306 return (i); |
|
307 } |
|
308 |
|
309 /* Return the index of the first whitespace character in STRING. */ |
|
310 int |
|
311 skip_non_whitespace (string) |
|
312 char *string; |
|
313 { |
|
314 register int i; |
|
315 |
|
316 for (i = 0; string && !whitespace (string[i]); i++); |
|
317 return (i); |
|
318 } |
|
319 |
|
320 /* Return the index of the first non-node character in STRING. Note that |
|
321 this function contains quite a bit of hair to ignore periods in some |
|
322 special cases. This is because we here at GNU ship some info files which |
|
323 contain nodenames that contain periods. No such nodename can start with |
|
324 a period, or continue with whitespace, newline, or ')' immediately following |
|
325 the period. If second argument NEWLINES_OKAY is non-zero, newlines should |
|
326 be skipped while parsing out the nodename specification. */ |
|
327 int |
|
328 skip_node_characters (string, newlines_okay) |
|
329 char *string; |
|
330 int newlines_okay; |
|
331 { |
|
332 register int c, i = 0; |
|
333 int paren_seen = 0; |
|
334 int paren = 0; |
|
335 |
|
336 /* Handle special case. This is when another function has parsed out the |
|
337 filename component of the node name, and we just want to parse out the |
|
338 nodename proper. In that case, a period at the start of the nodename |
|
339 indicates an empty nodename. */ |
|
340 if (string && *string == '.') |
|
341 return (0); |
|
342 |
|
343 if (string && *string == '(') |
|
344 { |
|
345 paren++; |
|
346 paren_seen++; |
|
347 i++; |
|
348 } |
|
349 |
|
350 for (; string && (c = string[i]); i++) |
|
351 { |
|
352 if (paren) |
|
353 { |
|
354 if (c == '(') |
|
355 paren++; |
|
356 else if (c == ')') |
|
357 paren--; |
|
358 |
|
359 continue; |
|
360 } |
|
361 |
|
362 /* If the character following the close paren is a space or period, |
|
363 then this node name has no more characters associated with it. */ |
|
364 if (c == '\t' || |
|
365 c == ',' || |
|
366 c == INFO_TAGSEP || |
|
367 ((!newlines_okay) && (c == '\n')) || |
|
368 ((paren_seen && string[i - 1] == ')') && |
|
369 (c == ' ' || c == '.')) || |
|
370 (c == '.' && |
|
371 ((!string[i + 1]) || |
|
372 (whitespace_or_newline (string[i + 1])) || |
|
373 (string[i + 1] == ')')))) |
|
374 break; |
|
375 } |
|
376 return (i); |
|
377 } |
|
378 |
306
|
379 #ifndef HAVE_STRICMP |
171
|
380 int |
|
381 stricmp (string1, string2) |
|
382 char *string1, *string2; |
|
383 { |
|
384 char ch1, ch2; |
|
385 |
|
386 for (;;) |
|
387 { |
|
388 ch1 = *string1++; |
|
389 ch2 = *string2++; |
|
390 |
|
391 if (!(ch1 | ch2)) |
|
392 return (0); |
|
393 |
|
394 ch1 = info_toupper (ch1); |
|
395 ch2 = info_toupper (ch2); |
|
396 |
|
397 if (ch1 != ch2) |
|
398 return (ch1 - ch2); |
|
399 } |
|
400 } |
306
|
401 #endif |
171
|
402 |
307
|
403 #ifndef HAVE_STRNICMP |
171
|
404 /* Compare at most COUNT characters from string1 to string2. Case |
|
405 doesn't matter. */ |
|
406 int |
|
407 strnicmp (string1, string2, count) |
|
408 char *string1, *string2; |
|
409 int count; |
|
410 { |
|
411 register char ch1, ch2; |
|
412 |
|
413 while (count) |
|
414 { |
|
415 ch1 = *string1++; |
|
416 ch2 = *string2++; |
|
417 |
|
418 ch1 = info_toupper (ch1); |
|
419 ch2 = info_toupper (ch2); |
|
420 |
|
421 if (ch1 == ch2) |
|
422 count--; |
|
423 else |
|
424 break; |
|
425 } |
|
426 return (count); |
|
427 } |
306
|
428 #endif |
171
|
429 |
|
430 /* **************************************************************** */ |
|
431 /* */ |
|
432 /* Searching FILE_BUFFER's */ |
|
433 /* */ |
|
434 /* **************************************************************** */ |
|
435 |
|
436 /* Return the absolute position of the first occurence of a node separator in |
|
437 BINDING-buffer. The search starts at BINDING->start. Return -1 if no node |
|
438 separator was found. */ |
|
439 long |
|
440 find_node_separator (binding) |
|
441 SEARCH_BINDING *binding; |
|
442 { |
|
443 register long i; |
|
444 char *body; |
|
445 |
|
446 body = binding->buffer; |
|
447 |
|
448 /* A node is started by [^L]^_[^L]\n. That is to say, the C-l's are |
|
449 optional, but the DELETE and NEWLINE are not. This separator holds |
|
450 true for all separated elements in an Info file, including the tags |
|
451 table (if present) and the indirect tags table (if present). */ |
|
452 for (i = binding->start; i < binding->end - 1; i++) |
|
453 if (((body[i] == INFO_FF && body[i + 1] == INFO_COOKIE) && |
|
454 (body[i + 2] == '\n' || |
|
455 (body[i + 2] == INFO_FF && body[i + 3] == '\n'))) || |
|
456 ((body[i] == INFO_COOKIE) && |
|
457 (body[i + 1] == '\n' || |
|
458 (body[i + 1] == INFO_FF && body[i + 2] == '\n')))) |
|
459 return (i); |
|
460 return (-1); |
|
461 } |
|
462 |
|
463 /* Return the length of the node separator characters that BODY is |
|
464 currently pointing at. */ |
|
465 int |
|
466 skip_node_separator (body) |
|
467 char *body; |
|
468 { |
|
469 register int i; |
|
470 |
|
471 i = 0; |
|
472 |
|
473 if (body[i] == INFO_FF) |
|
474 i++; |
|
475 |
|
476 if (body[i++] != INFO_COOKIE) |
|
477 return (0); |
|
478 |
|
479 if (body[i] == INFO_FF) |
|
480 i++; |
|
481 |
|
482 if (body[i++] != '\n') |
|
483 return (0); |
|
484 |
|
485 return (i); |
|
486 } |
|
487 |
|
488 /* Return the number of characters from STRING to the start of |
|
489 the next line. */ |
|
490 int |
|
491 skip_line (string) |
|
492 char *string; |
|
493 { |
|
494 register int i; |
|
495 |
|
496 for (i = 0; string && string[i] && string[i] != '\n'; i++); |
|
497 |
|
498 if (string[i] == '\n') |
|
499 i++; |
|
500 |
|
501 return (i); |
|
502 } |
|
503 |
|
504 /* Return the absolute position of the beginning of a tags table in this |
|
505 binding starting the search at binding->start. */ |
|
506 long |
|
507 find_tags_table (binding) |
|
508 SEARCH_BINDING *binding; |
|
509 { |
|
510 SEARCH_BINDING search; |
|
511 long position; |
|
512 |
|
513 search.buffer = binding->buffer; |
|
514 search.start = binding->start; |
|
515 search.end = binding->end; |
|
516 search.flags = S_FoldCase; |
|
517 |
|
518 while ((position = find_node_separator (&search)) != -1 ) |
|
519 { |
|
520 search.start = position; |
|
521 search.start += skip_node_separator (search.buffer + search.start); |
|
522 |
|
523 if (looking_at (TAGS_TABLE_BEG_LABEL, &search)) |
|
524 return (position); |
|
525 } |
|
526 return (-1); |
|
527 } |
|
528 |
|
529 /* Return the absolute position of the node named NODENAME in BINDING. |
|
530 This is a brute force search, and we wish to avoid it when possible. |
|
531 This function is called when a tag (indirect or otherwise) doesn't |
|
532 really point to the right node. It returns the absolute position of |
|
533 the separator preceding the node. */ |
|
534 long |
|
535 find_node_in_binding (nodename, binding) |
|
536 char *nodename; |
|
537 SEARCH_BINDING *binding; |
|
538 { |
|
539 register long position; |
|
540 register int offset, namelen; |
|
541 SEARCH_BINDING search; |
|
542 |
|
543 namelen = strlen (nodename); |
|
544 |
|
545 search.buffer = binding->buffer; |
|
546 search.start = binding->start; |
|
547 search.end = binding->end; |
|
548 search.flags = 0; |
|
549 |
|
550 while ((position = find_node_separator (&search)) != -1) |
|
551 { |
|
552 search.start = position; |
|
553 search.start += skip_node_separator (search.buffer + search.start); |
|
554 |
|
555 offset = string_in_line (INFO_NODE_LABEL, search.buffer + search.start); |
|
556 |
|
557 if (offset == -1) |
|
558 continue; |
|
559 |
|
560 search.start += offset; |
|
561 search.start += skip_whitespace (search.buffer + search.start); |
|
562 offset = skip_node_characters |
|
563 (search.buffer + search.start, DONT_SKIP_NEWLINES); |
|
564 |
|
565 /* Notice that this is an exact match. You cannot grovel through |
|
566 the buffer with this function looking for random nodes. */ |
|
567 if ((offset == namelen) && |
|
568 (search.buffer[search.start] == nodename[0]) && |
|
569 (strncmp (search.buffer + search.start, nodename, offset) == 0)) |
|
570 return (position); |
|
571 } |
|
572 return (-1); |
|
573 } |