comparison lib/uniwbrk/u-wordbreaks.h @ 19273:638b6d1fdf36 ueno/unicode-9.0.0

libunistring: update to Unicode 9.0.0

* lib/gen-uni-tables.c (fill_properties): Recognize Sentence_Terminal and Prepended_Concatenation_Mark.
(is_property_default_ignorable_code_point): Exclude U+08E2.
(fill_arabicshaping): Allow missing whitespace when parsing; recognize "AFRICAN FEH", "AFRICAN QAF", and "AFRICAN MOON".
(output_blocks): Increase the element size of the level1 table to accommodate more blocks.
(get_lbp): Recognize ZWJ, E_Base, and E_Modifier characters; update each class according to the standard.
(get_wbp): Recognize ZWJ, E_Base, E_Modifier, Glue_After_Zwj, and E_Base_GAZ characters.
(output_gbp_table): Recognize ZWJ, E_Base, E_Modifier, Glue_After_Zwj, and E_Base_GAZ characters.
* lib/unictype.in.h (UC_JOINING_GROUP_AFRICAN_FEH, UC_JOINING_GROUP_AFRICAN_QAF, UC_JOINING_GROUP_AFRICAN_MOON): New enum values.
* lib/unilbrk/lbrktables.h (LBP_ZWJ, LBP_EB, LBP_EM): New enum values.
* lib/unilbrk/lbrktables.c (unilbrk_table): Extend the table with LBP_ZWJ, LBP_EB, and LBP_EM.
* lib/uniwbrk.in.h (WBP_ZWJ, WBP_EB, WBP_EM, WBP_GAZ, WBP_EBG): New enum values.
* lib/uniwbrk/u-wordbreaks.h: Implement WB3c, WB15, and WB16.
* lib/uniwbrk/wbrktable.h (uniwbrk_prop_index): New variable declaration.
* lib/uniwbrk/wbrktable.c (uniwbrk_prop_index): New variable.
(uniwbrk_table): Implement WB14.
* tests/uniwbrk/test-uc-wordbreaks.c (wordbreakproperty_to_string): Check WBP_ZWJ, WBP_EB, WBP_EM, WBP_GAZ, and WBP_EBG.
* modules/unigbrk/u{32,16,8}-grapheme-breaks: No longer depend on uc-is-grapheme-break.
* modules/unigbrk/uc-grapheme-breaks: New module.
* modules/unigbrk/uc-grapheme-breaks-tests: New module.
* lib/unigbrk.in.h (GBP_ZWJ, GBP_EB, GBP_EM, GBP_GAZ, GBP_EBG): New enum values.
(uc_grapheme_breaks): New function, replacing uc_is_grapheme_break.
* lib/unigbrk/u-grapheme-breaks.h: New file.
* lib/unigbrk/u{32,16,8}-grapheme-breaks.c: Rewrite using u-grapheme-breaks.h instead of uc_is_grapheme_break.
* lib/unigbrk/uc-grapheme-breaks.c: New file.
* lib/unigbrk/uc-is-grapheme-break.c: Partially update to TR29 rev 29.
* tests/unigbrk/test-uc-gbrk-prop.c (graphemebreakproperty_to_string): Check GBP_ZWJ, GBP_EB, GBP_EM, GBP_GAZ, and GBP_EBG.
* tests/unigbrk/test-uc-grapheme-breaks.c: New test.
* tests/unigbrk/test-uc-is-grapheme-break.c (graphemebreakproperty_to_string): Check GBP_ZWJ, GBP_EB, GBP_EM, GBP_GAZ, and GBP_EBG.
(main): Skip unsupported rules involving 3 or more characters, namely GB10, GB12, and GB13.
* lib/uniwidth/width.c (nonspacing_table_data): Update.
author Daiki Ueno <ueno@gnu.org>
date Wed, 12 Oct 2016 17:40:37 +0200
parents 9759915b2aca
children 10eb9086bea0
comparison
equal deleted inserted replaced
19272:c20fd8143023 19273:638b6d1fdf36
37 /* For recognizing rules involving 3 complex characters: 37 /* For recognizing rules involving 3 complex characters:
38 Word break property of the second-to-last complex character. 38 Word break property of the second-to-last complex character.
39 -1 at the very beginning of the string. */ 39 -1 at the very beginning of the string. */
40 int secondlast_compchar_prop = -1; 40 int secondlast_compchar_prop = -1;
41 41
42 size_t ri_count = 0;
43
42 /* Don't break inside multibyte characters. */ 44 /* Don't break inside multibyte characters. */
43 memset (p, 0, n); 45 memset (p, 0, n);
44 46
45 while (s < s_end) 47 while (s < s_end)
46 { 48 {
49 int prop = uc_wordbreak_property (uc); 51 int prop = uc_wordbreak_property (uc);
50 52
51 /* No break at the start of the string. */ 53 /* No break at the start of the string. */
52 if (last_char_prop >= 0) 54 if (last_char_prop >= 0)
53 { 55 {
54 /* No break between CR and LF. */ 56 /* No break between CR and LF (WB3). */
55 if (last_char_prop == WBP_CR && prop == WBP_LF) 57 if (last_char_prop == WBP_CR && prop == WBP_LF)
56 /* *p = 0 */; 58 /* *p = 0 */;
57 /* Break before and after newlines. */ 59 /* Break before and after newlines (WB3a, WB3b). */
58 else if ((last_char_prop == WBP_CR 60 else if ((last_char_prop == WBP_CR
59 || last_char_prop == WBP_LF 61 || last_char_prop == WBP_LF
60 || last_char_prop == WBP_NEWLINE) 62 || last_char_prop == WBP_NEWLINE)
61 || (prop == WBP_CR 63 || (prop == WBP_CR
62 || prop == WBP_LF 64 || prop == WBP_LF
63 || prop == WBP_NEWLINE)) 65 || prop == WBP_NEWLINE))
64 *p = 1; 66 *p = 1;
67 /* No break within emoji zwj sequence (WB3c). */
68 else if (last_char_prop == WBP_ZWJ &&
69 (prop == WBP_GAZ || prop == WBP_EBG))
70 /* *p = 0 */;
65 /* Ignore Format and Extend characters. */ 71 /* Ignore Format and Extend characters. */
66 else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT)) 72 else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ))
67 { 73 {
68 /* No break in these situations (see UAX #29): 74 /* No break in these situations (see UAX #29):
69 75
70 secondlast last current 76 secondlast last current
71 77
73 (ALetter | HL) × (MidLetter | MidNumLet | SQ) (ALetter | HL) (WB6) 79 (ALetter | HL) × (MidLetter | MidNumLet | SQ) (ALetter | HL) (WB6)
74 Numeric (MidNum | MidNumLet | SQ) × Numeric (WB11) 80 Numeric (MidNum | MidNumLet | SQ) × Numeric (WB11)
75 Numeric × (MidNum | MidNumLet | SQ) Numeric (WB12) 81 Numeric × (MidNum | MidNumLet | SQ) Numeric (WB12)
76 HL × DQ HL (WB7b) 82 HL × DQ HL (WB7b)
77 HL DQ × HL (WB7c) 83 HL DQ × HL (WB7c)
78 (ALetter | HL) × (ALetter | HL) (WB5) 84 ^ (RI RI)* RI × RI (WB15)
79 (ALetter | HL) × Numeric (WB9) 85 [^RI] (RI RI)* RI × RI (WB16)
80 Numeric × (ALetter | HL) (WB10)
81 Numeric × Numeric (WB8)
82 HL × SQ (WB7a)
83 Katakana × Katakana (WB13)
84 (ALetter | HL | Numeric | Katakana) × ExtendNumLet (WB13a)
85 ExtendNumLet × ExtendNumLet (WB13a)
86 ExtendNumLet × (ALetter | HL | Numeric | Katakana) (WB13b)
87 Regional_Indicator × Regional_Indicator (WB13c)
88 */ 86 */
89 /* No break across certain punctuation. Also, disable word 87 /* No break across certain punctuation. Also, disable word
90 breaks that were recognized earlier (due to lookahead of 88 breaks that were recognized earlier (due to lookahead of
91 only one complex character). */ 89 only one complex character). */
92 if (((prop == WBP_ALETTER 90 if (((prop == WBP_ALETTER
106 && secondlast_compchar_prop == WBP_HL)) 104 && secondlast_compchar_prop == WBP_HL))
107 { 105 {
108 *last_compchar_ptr = 0; 106 *last_compchar_ptr = 0;
109 /* *p = 0; */ 107 /* *p = 0; */
110 } 108 }
111 /* Break after Format and Extend characters. */ 109 /* Break before RI, if odd number of RI's are
110 preceding (WB15, WB16). */
111 else if (last_compchar_prop == WBP_RI && prop == WBP_RI)
112 {
113 if (ri_count % 2 == 0)
114 *p = 1;
115 /* else *p = 0 */
116 }
117 /* Break after Format and Extend character. */
112 else if (last_compchar_prop == WBP_EXTEND 118 else if (last_compchar_prop == WBP_EXTEND
113 || last_compchar_prop == WBP_FORMAT) 119 || last_compchar_prop == WBP_FORMAT)
114 *p = 1; 120 *p = 1;
115 else 121 else
116 { 122 {
117 /* Normalize property value to table index, 123 int last_compchar_index =
118 skipping 5 properties: WBP_EXTEND, 124 uniwbrk_prop_index[last_compchar_prop];
119 WBP_FORMAT, WBP_NEWLINE, WBP_CR, and 125 int index = uniwbrk_prop_index[prop];
120 WBP_LF. */
121 int last_compchar_prop_index = last_compchar_prop;
122 int prop_index = prop;
123 126
124 if (last_compchar_prop_index >= WBP_EXTEND) 127 /* Break between unknown pair (WB999). */
125 last_compchar_prop_index -= 5; 128 if (last_compchar_index < 0 || index < 0)
126 129 *p = 1;
127 if (prop_index >= WBP_EXTEND)
128 prop_index -= 5;
129
130 /* Perform a single table lookup. */ 130 /* Perform a single table lookup. */
131 if (uniwbrk_table[last_compchar_prop_index][prop_index]) 131 else if (uniwbrk_table[last_compchar_index][index])
132 *p = 1; 132 *p = 1;
133 /* else *p = 0; */ 133 /* else *p = 0; */
134 } 134 }
135 } 135 }
136 } 136 }
137 137
138 last_char_prop = prop; 138 last_char_prop = prop;
139 /* Ignore Format and Extend characters, except at the start 139
140 of the line. */ 140 /* Ignore Format and Extend characters, except at the
141 start of the line. */
141 if (last_compchar_prop < 0 142 if (last_compchar_prop < 0
142 || last_compchar_prop == WBP_CR 143 || last_compchar_prop == WBP_CR
143 || last_compchar_prop == WBP_LF 144 || last_compchar_prop == WBP_LF
144 || last_compchar_prop == WBP_NEWLINE 145 || last_compchar_prop == WBP_NEWLINE
145 || !(prop == WBP_EXTEND || prop == WBP_FORMAT)) 146 || !(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ))
146 { 147 {
147 secondlast_compchar_prop = last_compchar_prop; 148 secondlast_compchar_prop = last_compchar_prop;
148 last_compchar_prop = prop; 149 last_compchar_prop = prop;
149 last_compchar_ptr = p; 150 last_compchar_ptr = p;
151
152 if (prop == WBP_RI)
153 ri_count++;
154 else
155 ri_count = 0;
150 } 156 }
151 157
152 s += count; 158 s += count;
153 p += count; 159 p += count;
154 } 160 }