Mercurial > gnulib
comparison lib/uniwbrk/u-wordbreaks.h @ 19273:638b6d1fdf36 ueno/unicode-9.0.0
libunistring: update to Unicode 9.0.0
* lib/gen-uni-tables.c (fill_properties): Recognize Sentence_Terminal
and Prepended_Concatenation_Mark.
(is_property_default_ignorable_code_point): Exclude U+08E2.
(fill_arabicshaping): Allow missing whitespace when parsing;
recognize "AFRICAN FEH", "AFRICAN QAF", and "AFRICAN NOON".
(output_blocks): Increase the element size of the level1 table to
accommodate more blocks.
(get_lbp): Recognize ZWJ, E_Base, and E_Modifier characters;
Update each class according to the standard.
(get_wbp): Recognize ZWJ, E_Base, E_Modifier, Glue_After_Zwj, and
E_Base_GAZ characters.
(output_gbp_table): Recognize ZWJ, E_Base, E_Modifier, Glue_After_Zwj,
and E_Base_GAZ characters.
* lib/unictype.in.h (UC_JOINING_GROUP_AFRICAN_FEH,
UC_JOINING_GROUP_AFRICAN_QAF, UC_JOINING_GROUP_AFRICAN_NOON): New enum value.
* lib/unilbrk/lbrktables.h (LBP_ZWJ, LBP_EB, LBP_EM): New enum value.
* lib/unilbrk/lbrktables.c (unilbrk_table): Extend the table with
LBP_ZWJ, LBP_EB, and LBP_EM.
* lib/uniwbrk.in.h (WBP_ZWJ, WBP_EB, WBP_EM, WBP_GAZ, WBP_EBG): New
enum value.
* lib/uniwbrk/u-wordbreaks.h: Implement WB3c, WB15, and WB16.
* lib/uniwbrk/wbrktable.h (uniwbrk_prop_index): New variable declaration.
* lib/uniwbrk/wbrktable.c (uniwbrk_prop_index): New variable.
(uniwbrk_table): Implement WB14.
* tests/uniwbrk/test-uc-wordbreaks.c (wordbreakproperty_to_string):
Check WBP_ZWJ, WBP_EB, WBP_EM, WBP_GAZ, and WBP_EBG.
* modules/unigbrk/u{32,16,8}-grapheme-breaks: No longer depend on
uc-is-grapheme-break.
* modules/unigbrk/uc-grapheme-breaks: New module.
* modules/unigbrk/uc-grapheme-breaks-tests: New module.
* lib/unigbrk.in.h (GBP_ZWJ, GBP_EB, GBP_EM, GBP_GAZ, GBP_EBG): New
enum value.
(uc_grapheme_breaks): New function, replacing uc_is_grapheme_break.
* lib/unigbrk/u-grapheme-breaks.h: New file.
* lib/unigbrk/u{32,16,8}-grapheme-breaks.c: Rewrite using
u-grapheme-breaks.h instead of uc_is_grapheme_break.
* lib/unigbrk/uc-grapheme-breaks.c: New file.
* lib/unigbrk/uc-is-grapheme-break.c: Partially update to TR29 rev 29.
* tests/unigbrk/test-uc-gbrk-prop.c (graphemebreakproperty_to_string):
Check GBP_ZWJ, GBP_EB, GBP_EM, GBP_GAZ, and GBP_EBG.
* tests/unigbrk/test-uc-grapheme-breaks.c: New test.
* tests/unigbrk/test-uc-is-grapheme-break.c (graphemebreakproperty_to_string):
Check GBP_ZWJ, GBP_EB, GBP_EM, GBP_GAZ, and GBP_EBG.
(main): Skip unsupported rules involving 3 or more characters, namely
GB10, GB12, and GB13.
* lib/uniwidth/width.c (nonspacing_table_data): Update.
author | Daiki Ueno <ueno@gnu.org> |
---|---|
date | Wed, 12 Oct 2016 17:40:37 +0200 |
parents | 9759915b2aca |
children | 10eb9086bea0 |
comparison
equal
deleted
inserted
replaced
19272:c20fd8143023 | 19273:638b6d1fdf36 |
---|---|
37 /* For recognizing rules involving 3 complex characters: | 37 /* For recognizing rules involving 3 complex characters: |
38 Word break property of the second-to-last complex character. | 38 Word break property of the second-to-last complex character. |
39 -1 at the very beginning of the string. */ | 39 -1 at the very beginning of the string. */ |
40 int secondlast_compchar_prop = -1; | 40 int secondlast_compchar_prop = -1; |
41 | 41 |
42 size_t ri_count = 0; | |
43 | |
42 /* Don't break inside multibyte characters. */ | 44 /* Don't break inside multibyte characters. */ |
43 memset (p, 0, n); | 45 memset (p, 0, n); |
44 | 46 |
45 while (s < s_end) | 47 while (s < s_end) |
46 { | 48 { |
49 int prop = uc_wordbreak_property (uc); | 51 int prop = uc_wordbreak_property (uc); |
50 | 52 |
51 /* No break at the start of the string. */ | 53 /* No break at the start of the string. */ |
52 if (last_char_prop >= 0) | 54 if (last_char_prop >= 0) |
53 { | 55 { |
54 /* No break between CR and LF. */ | 56 /* No break between CR and LF (WB3). */ |
55 if (last_char_prop == WBP_CR && prop == WBP_LF) | 57 if (last_char_prop == WBP_CR && prop == WBP_LF) |
56 /* *p = 0 */; | 58 /* *p = 0 */; |
57 /* Break before and after newlines. */ | 59 /* Break before and after newlines (WB3a, WB3b). */ |
58 else if ((last_char_prop == WBP_CR | 60 else if ((last_char_prop == WBP_CR |
59 || last_char_prop == WBP_LF | 61 || last_char_prop == WBP_LF |
60 || last_char_prop == WBP_NEWLINE) | 62 || last_char_prop == WBP_NEWLINE) |
61 || (prop == WBP_CR | 63 || (prop == WBP_CR |
62 || prop == WBP_LF | 64 || prop == WBP_LF |
63 || prop == WBP_NEWLINE)) | 65 || prop == WBP_NEWLINE)) |
64 *p = 1; | 66 *p = 1; |
67 /* No break within emoji zwj sequence (WB3c). */ | |
68 else if (last_char_prop == WBP_ZWJ && | |
69 (prop == WBP_GAZ || prop == WBP_EBG)) | |
70 /* *p = 0 */; | |
65 /* Ignore Format and Extend characters. */ | 71 /* Ignore Format and Extend characters. */ |
66 else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT)) | 72 else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ)) |
67 { | 73 { |
68 /* No break in these situations (see UAX #29): | 74 /* No break in these situations (see UAX #29): |
69 | 75 |
70 secondlast last current | 76 secondlast last current |
71 | 77 |
73 (ALetter | HL) × (MidLetter | MidNumLet | SQ) (ALetter | HL) (WB6) | 79 (ALetter | HL) × (MidLetter | MidNumLet | SQ) (ALetter | HL) (WB6) |
74 Numeric (MidNum | MidNumLet | SQ) × Numeric (WB11) | 80 Numeric (MidNum | MidNumLet | SQ) × Numeric (WB11) |
75 Numeric × (MidNum | MidNumLet | SQ) Numeric (WB12) | 81 Numeric × (MidNum | MidNumLet | SQ) Numeric (WB12) |
76 HL × DQ HL (WB7b) | 82 HL × DQ HL (WB7b) |
77 HL DQ × HL (WB7c) | 83 HL DQ × HL (WB7c) |
78 (ALetter | HL) × (ALetter | HL) (WB5) | 84 ^ (RI RI)* RI × RI (WB15) |
79 (ALetter | HL) × Numeric (WB9) | 85 [^RI] (RI RI)* RI × RI (WB16) |
80 Numeric × (ALetter | HL) (WB10) | |
81 Numeric × Numeric (WB8) | |
82 HL × SQ (WB7a) | |
83 Katakana × Katakana (WB13) | |
84 (ALetter | HL | Numeric | Katakana) × ExtendNumLet (WB13a) | |
85 ExtendNumLet × ExtendNumLet (WB13a) | |
86 ExtendNumLet × (ALetter | HL | Numeric | Katakana) (WB13b) | |
87 Regional_Indicator × Regional_Indicator (WB13c) | |
88 */ | 86 */ |
89 /* No break across certain punctuation. Also, disable word | 87 /* No break across certain punctuation. Also, disable word |
90 breaks that were recognized earlier (due to lookahead of | 88 breaks that were recognized earlier (due to lookahead of |
91 only one complex character). */ | 89 only one complex character). */ |
92 if (((prop == WBP_ALETTER | 90 if (((prop == WBP_ALETTER |
106 && secondlast_compchar_prop == WBP_HL)) | 104 && secondlast_compchar_prop == WBP_HL)) |
107 { | 105 { |
108 *last_compchar_ptr = 0; | 106 *last_compchar_ptr = 0; |
109 /* *p = 0; */ | 107 /* *p = 0; */ |
110 } | 108 } |
111 /* Break after Format and Extend characters. */ | 109 /* Break before RI, if even number of RI's are |
110 preceding (WB15, WB16). */ | |
111 else if (last_compchar_prop == WBP_RI && prop == WBP_RI) | |
112 { | |
113 if (ri_count % 2 == 0) | |
114 *p = 1; | |
115 /* else *p = 0 */ | |
116 } | |
117 /* Break after Format and Extend character. */ | |
112 else if (last_compchar_prop == WBP_EXTEND | 118 else if (last_compchar_prop == WBP_EXTEND |
113 || last_compchar_prop == WBP_FORMAT) | 119 || last_compchar_prop == WBP_FORMAT) |
114 *p = 1; | 120 *p = 1; |
115 else | 121 else |
116 { | 122 { |
117 /* Normalize property value to table index, | 123 int last_compchar_index = |
118 skipping 5 properties: WBP_EXTEND, | 124 uniwbrk_prop_index[last_compchar_prop]; |
119 WBP_FORMAT, WBP_NEWLINE, WBP_CR, and | 125 int index = uniwbrk_prop_index[prop]; |
120 WBP_LF. */ | |
121 int last_compchar_prop_index = last_compchar_prop; | |
122 int prop_index = prop; | |
123 | 126 |
124 if (last_compchar_prop_index >= WBP_EXTEND) | 127 /* Break between unknown pair (WB999). */ |
125 last_compchar_prop_index -= 5; | 128 if (last_compchar_index < 0 || index < 0) |
126 | 129 *p = 1; |
127 if (prop_index >= WBP_EXTEND) | |
128 prop_index -= 5; | |
129 | |
130 /* Perform a single table lookup. */ | 130 /* Perform a single table lookup. */ |
131 if (uniwbrk_table[last_compchar_prop_index][prop_index]) | 131 else if (uniwbrk_table[last_compchar_index][index]) |
132 *p = 1; | 132 *p = 1; |
133 /* else *p = 0; */ | 133 /* else *p = 0; */ |
134 } | 134 } |
135 } | 135 } |
136 } | 136 } |
137 | 137 |
138 last_char_prop = prop; | 138 last_char_prop = prop; |
139 /* Ignore Format and Extend characters, except at the start | 139 |
140 of the line. */ | 140 /* Ignore Format and Extend characters, except at the |
141 start of the line. */ | |
141 if (last_compchar_prop < 0 | 142 if (last_compchar_prop < 0 |
142 || last_compchar_prop == WBP_CR | 143 || last_compchar_prop == WBP_CR |
143 || last_compchar_prop == WBP_LF | 144 || last_compchar_prop == WBP_LF |
144 || last_compchar_prop == WBP_NEWLINE | 145 || last_compchar_prop == WBP_NEWLINE |
145 || !(prop == WBP_EXTEND || prop == WBP_FORMAT)) | 146 || !(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ)) |
146 { | 147 { |
147 secondlast_compchar_prop = last_compchar_prop; | 148 secondlast_compchar_prop = last_compchar_prop; |
148 last_compchar_prop = prop; | 149 last_compchar_prop = prop; |
149 last_compchar_ptr = p; | 150 last_compchar_ptr = p; |
151 | |
152 if (prop == WBP_RI) | |
153 ri_count++; | |
154 else | |
155 ri_count = 0; | |
150 } | 156 } |
151 | 157 |
152 s += count; | 158 s += count; |
153 p += count; | 159 p += count; |
154 } | 160 } |