diff lib/uniwbrk/u-wordbreaks.h @ 19273:638b6d1fdf36 ueno/unicode-9.0.0

libunistring: update to Unicode 9.0.0 * lib/gen-uni-tables.c (fill_properties): Recognize Sentence_Terminal and Prepended_Concatenation_Mark. (is_property_default_ignorable_code_point): Exclude U+08E2. (fill_arabicshaping): Allow missing whitespace when parsing; recognize "AFRICAN FEH", "AFRICAN QAF", and "AFRICAN MOON". (output_blocks): Increase the element size of the level1 table to accommodate more blocks. (get_lbp): Recognize ZWJ, E_Base, and E_Modifier characters; Update each class according to the standard. (get_wbp): Recognize ZWJ, E_Base, E_Modifier, Glue_After_Zwj, and E_Base_GAZ characters. (output_gbp_table): Recognize ZWJ, E_Base, E_Modifier, Glue_After_Zwj, and E_Base_GAZ characters. * lib/unictype.in.h (UC_JOINING_GROUP_AFRICAN_FEH, UC_JOINING_GROUP_AFRICAN_QAF, UC_JOINING_GROUP_AFRICAN_MOON): New enum value. * lib/unilbrk/lbrktables.h (LBP_ZWJ, LBP_EB, LBP_EM): New enum value. * lib/unilbrk/lbrktables.c (unilbrk_table): Extend the table with LBP_ZWJ, LBP_EB, and LBP_EM. * lib/uniwbrk.in.h (WBP_ZWJ, WBP_EB, WBP_EM, WBP_GAZ, WBP_EBG): New enum value. * lib/uniwbrk/u-wordbreaks.h: Implement WB3c, WB15, and WB16. * lib/uniwbrk/wbrktable.h (uniwbrk_prop_index): New variable declaration. * lib/uniwbrk/wbrktable.c (uniwbrk_prop_index): New variable. (uniwbrk_table): Implement WB14. * tests/uniwbrk/test-uc-wordbreaks.c (wordbreakproperty_to_string): Check WBP_ZWJ, WBP_EB, WBP_EM, WBP_GAZ, and WBP_EBG. * modules/unigbrk/u{32,16,8}-grapheme-breaks: No longer depend on uc-is-grapheme-break. * modules/unigbrk/uc-grapheme-breaks: New module. * modules/unigbrk/uc-grapheme-breaks-tests: New module. * lib/unigbrk.in.h (GBP_ZWJ, GBP_EB, GBP_EM, GBP_GAZ, GBP_EBG): New enum value. (uc_grapheme_breaks): New function, replacing uc_is_grapheme_break. * lib/unigbrk/u-grapheme-breaks.h: New file. * lib/unigbrk/u{32,16,8}-grapheme-breaks.c: Rewrite using u-grapheme-breaks.h instead of uc_is_grapheme_break. * lib/unigbrk/uc-grapheme-breaks.c: New file. 
* lib/unigbrk/uc-is-grapheme-break.c: Partially update to TR29 rev 29. * tests/unigbrk/test-uc-gbrk-prop.c (graphemebreakproperty_to_string): Check GBP_ZWJ, GBP_EB, GBP_EM, GBP_GAZ, and GBP_EBG. * tests/unigbrk/test-uc-grapheme-breaks.c: New test. * tests/unigbrk/test-uc-is-grapheme-break.c (graphemebreakproperty_to_string): Check GBP_ZWJ, GBP_EB, GBP_EM, GBP_GAZ, and GBP_EBG. (main): Skip unsupported rules involving 3 or more characters, namely GB10, GB12, and GB13. * lib/uniwidth/width.c (nonspacing_table_data): Update.
author Daiki Ueno <ueno@gnu.org>
date Wed, 12 Oct 2016 17:40:37 +0200
parents 9759915b2aca
children 10eb9086bea0
line wrap: on
line diff
--- a/lib/uniwbrk/u-wordbreaks.h	Sun Oct 29 16:22:41 2017 -0700
+++ b/lib/uniwbrk/u-wordbreaks.h	Wed Oct 12 17:40:37 2016 +0200
@@ -39,6 +39,8 @@
          -1 at the very beginning of the string.  */
       int secondlast_compchar_prop = -1;
 
+      size_t ri_count = 0;
+
       /* Don't break inside multibyte characters.  */
       memset (p, 0, n);
 
@@ -51,10 +53,10 @@
           /* No break at the start of the string.  */
           if (last_char_prop >= 0)
             {
-              /* No break between CR and LF.  */
+              /* No break between CR and LF (WB3).  */
               if (last_char_prop == WBP_CR && prop == WBP_LF)
                 /* *p = 0 */;
-              /* Break before and after newlines.  */
+              /* Break before and after newlines (WB3a, WB3b).  */
               else if ((last_char_prop == WBP_CR
                         || last_char_prop == WBP_LF
                         || last_char_prop == WBP_NEWLINE)
@@ -62,8 +64,12 @@
                            || prop == WBP_LF
                            || prop == WBP_NEWLINE))
                 *p = 1;
+              /* No break within emoji zwj sequence (WB3c).  */
+              else if (last_char_prop == WBP_ZWJ &&
+                       (prop == WBP_GAZ || prop == WBP_EBG))
+                /* *p = 0 */;
               /* Ignore Format and Extend characters.  */
-              else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT))
+              else if (!(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ))
                 {
                   /* No break in these situations (see UAX #29):
 
@@ -75,16 +81,8 @@
                   Numeric × (MidNum | MidNumLet | SQ)      Numeric      (WB12)
                                                         HL × DQ HL      (WB7b)
                                                         HL DQ × HL      (WB7c)
-                                   (ALetter | HL) × (ALetter | HL)      (WB5)
-                                          (ALetter | HL) × Numeric      (WB9)
-                                          Numeric × (ALetter | HL)      (WB10)
-                                                 Numeric × Numeric      (WB8)
-                                                      HL × SQ           (WB7a)
-                                                Katakana × Katakana     (WB13)
-                     (ALetter | HL | Numeric | Katakana) × ExtendNumLet (WB13a)
-                                            ExtendNumLet × ExtendNumLet (WB13a)
-                    ExtendNumLet × (ALetter | HL | Numeric | Katakana)  (WB13b)
-                               Regional_Indicator × Regional_Indicator  (WB13c)
+                                                ^ (RI RI)* RI × RI      (WB15)
+                                            [^RI] (RI RI)* RI × RI      (WB16)
                    */
                   /* No break across certain punctuation.  Also, disable word
                      breaks that were recognized earlier (due to lookahead of
@@ -108,27 +106,29 @@
                       *last_compchar_ptr = 0;
                       /* *p = 0; */
                     }
-                  /* Break after Format and Extend characters.  */
+                  /* Break before RI only if an even number of RI's
+                     are preceding (WB15, WB16).  */
+                  else if (last_compchar_prop == WBP_RI && prop == WBP_RI)
+                    {
+                      if (ri_count % 2 == 0)
+                        *p = 1;
+                      /* else *p = 0 */
+                    }
+                  /* Break after Format and Extend characters.  */
                   else if (last_compchar_prop == WBP_EXTEND
                            || last_compchar_prop == WBP_FORMAT)
                     *p = 1;
                   else
                     {
-                      /* Normalize property value to table index,
-                         skipping 5 properties: WBP_EXTEND,
-                         WBP_FORMAT, WBP_NEWLINE, WBP_CR, and
-                         WBP_LF.  */
-                      int last_compchar_prop_index = last_compchar_prop;
-                      int prop_index = prop;
+                      int last_compchar_index =
+                        uniwbrk_prop_index[last_compchar_prop];
+                      int index = uniwbrk_prop_index[prop];
 
-                      if (last_compchar_prop_index >= WBP_EXTEND)
-                        last_compchar_prop_index -= 5;
-
-                      if (prop_index >= WBP_EXTEND)
-                        prop_index -= 5;
-
+                      /* Break between unknown pair (WB999).  */
+                      if (last_compchar_index < 0 || index < 0)
+                        *p = 1;
                       /* Perform a single table lookup.  */
-                      if (uniwbrk_table[last_compchar_prop_index][prop_index])
+                      else if (uniwbrk_table[last_compchar_index][index])
                         *p = 1;
                       /* else *p = 0; */
                     }
@@ -136,17 +136,23 @@
             }
 
           last_char_prop = prop;
-          /* Ignore Format and Extend characters, except at the start
-             of the line.  */
+
+          /* Ignore Format, Extend, and ZWJ characters, except at
+             the start of the line.  */
           if (last_compchar_prop < 0
               || last_compchar_prop == WBP_CR
               || last_compchar_prop == WBP_LF
               || last_compchar_prop == WBP_NEWLINE
-              || !(prop == WBP_EXTEND || prop == WBP_FORMAT))
+              || !(prop == WBP_EXTEND || prop == WBP_FORMAT || prop == WBP_ZWJ))
             {
               secondlast_compchar_prop = last_compchar_prop;
               last_compchar_prop = prop;
               last_compchar_ptr = p;
+
+              if (prop == WBP_RI)
+                ri_count++;
+              else
+                ri_count = 0;
             }
 
           s += count;