Mercurial > gnulib
annotate lib/unigbrk/u-grapheme-breaks.h @ 40057:b06060465f09
maint: Run 'make update-copyright'
author | Paul Eggert <eggert@cs.ucla.edu> |
---|---|
date | Tue, 01 Jan 2019 00:25:11 +0100 |
parents | 10eb9086bea0 |
children |
rev | line source |
---|---|
19452
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
1 /* Grapheme cluster break function. |
40057
b06060465f09
maint: Run 'make update-copyright'
Paul Eggert <eggert@cs.ucla.edu>
parents:
19484
diff
changeset
|
2 Copyright (C) 2010-2019 Free Software Foundation, Inc. |
19452
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
3 Written by Ben Pfaff <blp@cs.stanford.edu>, 2010. |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
4 |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
5 This program is free software: you can redistribute it and/or modify it |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
6 under the terms of the GNU Lesser General Public License as published |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
7 by the Free Software Foundation; either version 3 of the License, or |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
8 (at your option) any later version. |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
9 |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
10 This program is distributed in the hope that it will be useful, |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
11 but WITHOUT ANY WARRANTY; without even the implied warranty of |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
13 Lesser General Public License for more details. |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
14 |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
15 You should have received a copy of the GNU Lesser General Public License |
19461
c1cbd8206d4b
all: Replace more http URLs by https URLs.
Bruno Haible <bruno@clisp.org>
parents:
19452
diff
changeset
|
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */ |
19452
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
17 |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
18 void |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
19 FUNC (const UNIT *s, size_t n, char *p) |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
20 { |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
21 if (n > 0) |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
22 { |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
23 const UNIT *s_end = s + n; |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
24 |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
25 /* Grapheme Cluster break property of the last character. |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
26 -1 at the very beginning of the string. */ |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
27 int last_char_prop = -1; |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
28 |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
29 /* Grapheme Cluster break property of the last complex character. |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
30 -1 at the very beginning of the string. */ |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
31 int last_compchar_prop = -1; |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
32 |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
33 size_t ri_count = 0; |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
34 |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
35 /* Don't break inside multibyte characters. */ |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
36 memset (p, 0, n); |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
37 |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
38 while (s < s_end) |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
39 { |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
40 ucs4_t uc; |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
41 int count = U_MBTOUC (&uc, s, s_end - s); |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
42 int prop = uc_graphemeclusterbreak_property (uc); |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
43 |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
44 /* Break at the start of the string (GB1). */ |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
45 if (last_char_prop < 0) |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
46 *p = 1; |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
47 else |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
48 { |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
49 /* No break between CR and LF (GB3). */ |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
50 if (last_char_prop == GBP_CR && prop == GBP_LF) |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
51 /* *p = 0 */; |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
52 /* Break before and after newlines (GB4, GB5). */ |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
53 else if ((last_char_prop == GBP_CR |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
54 || last_char_prop == GBP_LF |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
55 || last_char_prop == GBP_CONTROL) |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
56 || (prop == GBP_CR |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
57 || prop == GBP_LF |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
58 || prop == GBP_CONTROL)) |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
59 *p = 1; |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
60 /* No break between Hangul syllable sequences (GB6, GB7, GB8). */ |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
61 else if ((last_char_prop == GBP_L |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
62 && (prop == GBP_L |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
63 || prop == GBP_V |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
64 || prop == GBP_LV |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
65 || prop == GBP_LVT)) |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
66 || ((last_char_prop == GBP_LV |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
67 || last_char_prop == GBP_V) |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
68 && (prop == GBP_V |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
69 || prop == GBP_T)) |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
70 || ((last_char_prop == GBP_LVT |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
71 || last_char_prop == GBP_T) |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
72 && prop == GBP_T)) |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
73 /* *p = 0 */; |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
74 /* No break before extending characters or ZWJ (GB9). */ |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
75 else if (prop == GBP_EXTEND || prop == GBP_ZWJ) |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
76 /* *p = 0 */; |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
77 /* No break before SpacingMarks (GB9a). */ |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
78 else if (prop == GBP_SPACINGMARK) |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
79 /* *p = 0 */; |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
80 /* No break after Prepend characters (GB9b). */ |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
81 else if (last_char_prop == GBP_PREPEND) |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
82 /* *p = 0 */; |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
83 /* No break within emoji modifier sequences (GB10). */ |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
84 else if ((last_compchar_prop == GBP_EB |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
85 || last_compchar_prop == GBP_EBG) |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
86 && prop == GBP_EM) |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
87 /* *p = 0 */; |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
88 /* No break within emoji zwj sequences (GB11). */ |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
89 else if (last_char_prop == GBP_ZWJ |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
90 && (prop == GBP_GAZ |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
91 || prop == GBP_EBG)) |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
92 /* *p = 0 */; |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
93 /* No break between RI if there is an odd number of RI |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
94 characters before (GB12, GB13). */ |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
95 else if (prop == GBP_RI) |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
96 { |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
97 if (ri_count % 2 == 0) |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
98 *p = 1; |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
99 /* else *p = 0; */ |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
100 } |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
101 /* Break everywhere (GBP999). */ |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
102 else |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
103 *p = 1; |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
104 } |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
105 |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
106 last_char_prop = prop; |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
107 |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
108 if (!(prop == GBP_EXTEND |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
109 && (last_compchar_prop == GBP_EB |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
110 || last_compchar_prop == GBP_EBG))) |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
111 last_compchar_prop = prop; |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
112 |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
113 if (prop == GBP_RI) |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
114 ri_count++; |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
115 else |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
116 ri_count = 0; |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
117 |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
118 s += count; |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
119 p += count; |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
120 } |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
121 } |
4c0b38aac75f
libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff
changeset
|
122 } |