Mercurial > gnulib
diff lib/uniwbrk/u-wordbreaks.h @ 19273:638b6d1fdf36 ueno/unicode-9.0.0
libunistring: update to Unicode 9.0.0
* lib/gen-uni-tables.c (fill_properties): Recognize Sentence_Terminal
and Prepended_Concatenation_Mark.
(is_property_default_ignorable_code_point): Exclude U+08E2.
(fill_arabicshaping): Allow missing whitespace when parsing;
recognize "AFRICAN FEH", "AFRICAN QAF", and "AFRICAN MOON".
(output_blocks): Increase the element size of the level1 table to
accommodate more blocks.
(get_lbp): Recognize ZWJ, E_Base, and E_Modifier characters;
update each class according to the standard.
(get_wbp): Recognize ZWJ, E_Base, E_Modifier, Glue_After_Zwj, and
E_Base_GAZ characters.
(output_gbp_table): Recognize ZWJ, E_Base, E_Modifier, Glue_After_Zwj,
and E_Base_GAZ characters.
* lib/unictype.in.h (UC_JOINING_GROUP_AFRICAN_FEH,
UC_JOINING_GROUP_AFRICAN_QAF, UC_JOINING_GROUP_AFRICAN_MOON): New enum values.
* lib/unilbrk/lbrktables.h (LBP_ZWJ, LBP_EB, LBP_EM): New enum values.
* lib/unilbrk/lbrktables.c (unilbrk_table): Extend the table with
LBP_ZWJ, LBP_EB, and LBP_EM.
* lib/uniwbrk.in.h (WBP_ZWJ, WBP_EB, WBP_EM, WBP_GAZ, WBP_EBG): New
enum values.
* lib/uniwbrk/u-wordbreaks.h: Implement WB3c, WB15, and WB16.
* lib/uniwbrk/wbrktable.h (uniwbrk_prop_index): New variable declaration.
* lib/uniwbrk/wbrktable.c (uniwbrk_prop_index): New variable.
(uniwbrk_table): Implement WB14.
* tests/uniwbrk/test-uc-wordbreaks.c (wordbreakproperty_to_string):
Check WBP_ZWJ, WBP_EB, WBP_EM, WBP_GAZ, and WBP_EBG.
* modules/unigbrk/u{32,16,8}-grapheme-breaks: No longer depend on
uc-is-grapheme-break.
* modules/unigbrk/uc-grapheme-breaks: New module.
* modules/unigbrk/uc-grapheme-breaks-tests: New module.
* lib/unigbrk.in.h (GBP_ZWJ, GBP_EB, GBP_EM, GBP_GAZ, GBP_EBG): New
enum values.
(uc_grapheme_breaks): New function, replacing uc_is_grapheme_break.
* lib/unigbrk/u-grapheme-breaks.h: New file.
* lib/unigbrk/u{32,16,8}-grapheme-breaks.c: Rewrite using
u-grapheme-breaks.h instead of uc_is_grapheme_break.
* lib/unigbrk/uc-grapheme-breaks.c: New file.
* lib/unigbrk/uc-is-grapheme-break.c: Partially update to TR29 rev 29.
* tests/unigbrk/test-uc-gbrk-prop.c (graphemebreakproperty_to_string):
Check GBP_ZWJ, GBP_EB, GBP_EM, GBP_GAZ, and GBP_EBG.
* tests/unigbrk/test-uc-grapheme-breaks.c: New test.
* tests/unigbrk/test-uc-is-grapheme-break.c (graphemebreakproperty_to_string):
Check GBP_ZWJ, GBP_EB, GBP_EM, GBP_GAZ, and GBP_EBG.
(main): Skip unsupported rules involving 3 or more characters, namely
GB10, GB12, and GB13.
* lib/uniwidth/width.c (nonspacing_table_data): Update.
author | Daiki Ueno <ueno@gnu.org> |
---|---|
date | Wed, 12 Oct 2016 17:40:37 +0200 |
parents | 9759915b2aca |
children | 10eb9086bea0 |
line wrap: on
line diff
--- a/lib/uniwbrk/u-wordbreaks.h Sun Oct 29 16:22:41 2017 -0700 +++ b/lib/uniwbrk/u-wordbreaks.h Wed Oct 12 17:40:37 2016 +0200 @@ -39,6 +39,8 @@ -1 at the very beginning of the string. */ int secondlast_compchar_prop = -1; + size_t ri_count = 0; + /* Don't break inside multibyte characters. */ memset (p, 0, n); @@ -51,10 +53,10 @@ /* No break at the start of the string. */ if (last_char_prop >= 0) { - /* No break between CR and LF. */ + /* No break between CR and LF (WB3). */ if (last_char_prop == WBP_CR && prop == WBP_LF) /* *p = 0 */; - /* Break before and after newlines. */ + /* Break before and after newlines (WB3a, WB3b). */ else if ((last_char_prop == WBP_CR || last_char_prop == WBP_LF || last_char_prop == WBP_NEWLINE) @@ -62,8 +64,12 @@ || prop == WBP_LF || prop == WBP_NEWLINE)) *p = 1; + /* No break within emoji zwj sequence (WB3c). */ + else if (last_char_prop == WBP_ZWJ && + (prop == WBP_GAZ || prop == WBP_EBG)) + /* *p = 0 */; /* Ignore Format and Extend characters. */ - else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT)) + else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ)) { /* No break in these situations (see UAX #29): @@ -75,16 +81,8 @@ Numeric × (MidNum | MidNumLet | SQ) Numeric (WB12) HL × DQ HL (WB7b) HL DQ × HL (WB7c) - (ALetter | HL) × (ALetter | HL) (WB5) - (ALetter | HL) × Numeric (WB9) - Numeric × (ALetter | HL) (WB10) - Numeric × Numeric (WB8) - HL × SQ (WB7a) - Katakana × Katakana (WB13) - (ALetter | HL | Numeric | Katakana) × ExtendNumLet (WB13a) - ExtendNumLet × ExtendNumLet (WB13a) - ExtendNumLet × (ALetter | HL | Numeric | Katakana) (WB13b) - Regional_Indicator × Regional_Indicator (WB13c) + ^ (RI RI)* RI × RI (WB15) + [^RI] (RI RI)* RI × RI (WB16) */ /* No break across certain punctuation. Also, disable word breaks that were recognized earlier (due to lookahead of @@ -108,27 +106,29 @@ *last_compchar_ptr = 0; /* *p = 0; */ } - /* Break after Format and Extend characters. 
*/ + /* Break before RI, if odd number of RI's are + preceding (WB15, WB16). */ + else if (last_compchar_prop == WBP_RI && prop == WBP_RI) + { + if (ri_count % 2 == 0) + *p = 1; + /* else *p = 0 */ + } + /* Break after Format and Extend character. */ else if (last_compchar_prop == WBP_EXTEND || last_compchar_prop == WBP_FORMAT) *p = 1; else { - /* Normalize property value to table index, - skipping 5 properties: WBP_EXTEND, - WBP_FORMAT, WBP_NEWLINE, WBP_CR, and - WBP_LF. */ - int last_compchar_prop_index = last_compchar_prop; - int prop_index = prop; + int last_compchar_index = + uniwbrk_prop_index[last_compchar_prop]; + int index = uniwbrk_prop_index[prop]; - if (last_compchar_prop_index >= WBP_EXTEND) - last_compchar_prop_index -= 5; - - if (prop_index >= WBP_EXTEND) - prop_index -= 5; - + /* Break between unknown pair (WB999). */ + if (last_compchar_index < 0 || index < 0) + *p = 1; /* Perform a single table lookup. */ - if (uniwbrk_table[last_compchar_prop_index][prop_index]) + else if (uniwbrk_table[last_compchar_index][index]) *p = 1; /* else *p = 0; */ } @@ -136,17 +136,23 @@ } last_char_prop = prop; - /* Ignore Format and Extend characters, except at the start - of the line. */ + + /* Ignore Format and Extend characters, except at the + start of the line. */ if (last_compchar_prop < 0 || last_compchar_prop == WBP_CR || last_compchar_prop == WBP_LF || last_compchar_prop == WBP_NEWLINE - || !(prop == WBP_EXTEND || prop == WBP_FORMAT)) + || !(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ)) { secondlast_compchar_prop = last_compchar_prop; last_compchar_prop = prop; last_compchar_ptr = p; + + if (prop == WBP_RI) + ri_count++; + else + ri_count = 0; } s += count;