annotate lib/unigbrk/u-grapheme-breaks.h @ 40057:b06060465f09

maint: Run 'make update-copyright'
author Paul Eggert <eggert@cs.ucla.edu>
date Tue, 01 Jan 2019 00:25:11 +0100
parents 10eb9086bea0
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
19452
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
1 /* Grapheme cluster break function.
40057
b06060465f09 maint: Run 'make update-copyright'
Paul Eggert <eggert@cs.ucla.edu>
parents: 19484
diff changeset
2 Copyright (C) 2010-2019 Free Software Foundation, Inc.
19452
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
3 Written by Ben Pfaff <blp@cs.stanford.edu>, 2010.
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
4
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
5 This program is free software: you can redistribute it and/or modify it
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
6 under the terms of the GNU Lesser General Public License as published
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
7 by the Free Software Foundation; either version 3 of the License, or
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
8 (at your option) any later version.
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
9
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
10 This program is distributed in the hope that it will be useful,
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
13 Lesser General Public License for more details.
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
14
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
15 You should have received a copy of the GNU Lesser General Public License
19461
c1cbd8206d4b all: Replace more http URLs by https URLs.
Bruno Haible <bruno@clisp.org>
parents: 19452
diff changeset
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
19452
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
17
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
18 void
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
19 FUNC (const UNIT *s, size_t n, char *p)
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
20 {
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
21 if (n > 0)
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
22 {
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
23 const UNIT *s_end = s + n;
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
24
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
25 /* Grapheme Cluster break property of the last character.
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
26 -1 at the very beginning of the string. */
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
27 int last_char_prop = -1;
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
28
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
29 /* Grapheme Cluster break property of the last complex character.
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
30 -1 at the very beginning of the string. */
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
31 int last_compchar_prop = -1;
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
32
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
33 size_t ri_count = 0;
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
34
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
35 /* Don't break inside multibyte characters. */
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
36 memset (p, 0, n);
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
37
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
38 while (s < s_end)
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
39 {
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
40 ucs4_t uc;
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
41 int count = U_MBTOUC (&uc, s, s_end - s);
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
42 int prop = uc_graphemeclusterbreak_property (uc);
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
43
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
44 /* Break at the start of the string (GB1). */
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
45 if (last_char_prop < 0)
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
46 *p = 1;
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
47 else
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
48 {
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
49 /* No break between CR and LF (GB3). */
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
50 if (last_char_prop == GBP_CR && prop == GBP_LF)
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
51 /* *p = 0 */;
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
52 /* Break before and after newlines (GB4, GB5). */
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
53 else if ((last_char_prop == GBP_CR
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
54 || last_char_prop == GBP_LF
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
55 || last_char_prop == GBP_CONTROL)
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
56 || (prop == GBP_CR
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
57 || prop == GBP_LF
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
58 || prop == GBP_CONTROL))
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
59 *p = 1;
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
60 /* No break between Hangul syllable sequences (GB6, GB7, GB8). */
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
61 else if ((last_char_prop == GBP_L
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
62 && (prop == GBP_L
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
63 || prop == GBP_V
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
64 || prop == GBP_LV
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
65 || prop == GBP_LVT))
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
66 || ((last_char_prop == GBP_LV
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
67 || last_char_prop == GBP_V)
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
68 && (prop == GBP_V
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
69 || prop == GBP_T))
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
70 || ((last_char_prop == GBP_LVT
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
71 || last_char_prop == GBP_T)
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
72 && prop == GBP_T))
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
73 /* *p = 0 */;
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
74 /* No break before extending characters or ZWJ (GB9). */
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
75 else if (prop == GBP_EXTEND || prop == GBP_ZWJ)
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
76 /* *p = 0 */;
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
77 /* No break before SpacingMarks (GB9a). */
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
78 else if (prop == GBP_SPACINGMARK)
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
79 /* *p = 0 */;
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
80 /* No break after Prepend characters (GB9b). */
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
81 else if (last_char_prop == GBP_PREPEND)
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
82 /* *p = 0 */;
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
83 /* No break within emoji modifier sequences (GB10). */
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
84 else if ((last_compchar_prop == GBP_EB
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
85 || last_compchar_prop == GBP_EBG)
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
86 && prop == GBP_EM)
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
87 /* *p = 0 */;
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
88 /* No break within emoji zwj sequences (GB11). */
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
89 else if (last_char_prop == GBP_ZWJ
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
90 && (prop == GBP_GAZ
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
91 || prop == GBP_EBG))
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
92 /* *p = 0 */;
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
93 /* No break between RI if there is an odd number of RI
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
94 characters before (GB12, GB13). */
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
95 else if (prop == GBP_RI)
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
96 {
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
97 if (ri_count % 2 == 0)
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
98 *p = 1;
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
99 /* else *p = 0; */
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
100 }
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
101 /* Break everywhere (GBP999). */
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
102 else
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
103 *p = 1;
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
104 }
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
105
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
106 last_char_prop = prop;
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
107
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
108 if (!(prop == GBP_EXTEND
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
109 && (last_compchar_prop == GBP_EB
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
110 || last_compchar_prop == GBP_EBG)))
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
111 last_compchar_prop = prop;
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
112
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
113 if (prop == GBP_RI)
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
114 ri_count++;
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
115 else
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
116 ri_count = 0;
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
117
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
118 s += count;
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
119 p += count;
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
120 }
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
121 }
4c0b38aac75f libunistring: update to Unicode 9.0.0
Daiki Ueno <ueno@gnu.org>
parents:
diff changeset
122 }