changeset 11206:c7e84b56dbc3

Implement new clarified decomposition of Hangul syllables.
author Bruno Haible <bruno@clisp.org>
date Sun, 22 Feb 2009 15:05:45 +0100
parents 2d19b06ab374
children 4daf474e9033
files ChangeLog lib/uninorm/canonical-decomposition.c lib/uninorm/decomposition.c tests/uninorm/test-canonical-decomposition.c tests/uninorm/test-compat-decomposition.c tests/uninorm/test-decomposition.c
diffstat 6 files changed, 97 insertions(+), 19 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog	Sun Feb 22 13:15:52 2009 +0100
+++ b/ChangeLog	Sun Feb 22 15:05:45 2009 +0100
@@ -1,3 +1,14 @@
+2009-02-22  Bruno Haible  <bruno@clisp.org>
+
+	Implement new clarified decomposition of Hangul syllables.
+	* lib/uninorm/decomposition.c (uc_decomposition): For Hangul syllables
+	of type LTV, return only a pairwise decomposition.
+	* lib/uninorm/canonical-decomposition.c (uc_canonical_decomposition):
+	Likewise.
+	* tests/uninorm/test-decomposition.c (main): Updated expected result.
+	* tests/uninorm/test-canonical-decomposition.c (main): Likewise.
+	* tests/uninorm/test-compat-decomposition.c (main): Likewise.
+
 2009-02-22  Bruno Haible  <bruno@clisp.org>
 
 	* lib/uninorm/u-normalize-internal.h (FUNC): At the end, handle
--- a/lib/uninorm/canonical-decomposition.c	Sun Feb 22 13:15:52 2009 +0100
+++ b/lib/uninorm/canonical-decomposition.c	Sun Feb 22 15:05:45 2009 +0100
@@ -29,24 +29,45 @@
 {
   if (uc >= 0xAC00 && uc < 0xD7A4)
     {
-      /* Hangul syllable.  See Unicode standard, chapter 3,
-	 section "Hangul Syllable Decomposition".  */
-      unsigned int t, v, l;
+      /* Hangul syllable.  See Unicode standard, chapter 3, section
+         "Hangul Syllable Decomposition".  See also the clarification at
+	 <http://www.unicode.org/versions/Unicode5.1.0/>, section
+	 "Clarification of Hangul Jamo Handling".  */
+      unsigned int t;
 
       uc -= 0xAC00;
       t = uc % 28;
-      uc = uc / 28;
-      v = uc % 21;
-      l = uc / 21;
+
+      if (t == 0)
+	{
+	  unsigned int v, l;
 
-      decomposition[0] = 0x1100 + l;
-      decomposition[1] = 0x1161 + v;
-      if (t == 0)
-	return 2;
+	  uc = uc / 28;
+	  v = uc % 21;
+	  l = uc / 21;
+
+	  decomposition[0] = 0x1100 + l;
+	  decomposition[1] = 0x1161 + v;
+	  return 2;
+	}
       else
 	{
+#if 1 /* Return the pairwise decomposition, not the full decomposition.  */
+	  decomposition[0] = 0xAC00 + uc - t; /* = 0xAC00 + (l * 21 + v) * 28; */
+	  decomposition[1] = 0x11A7 + t;
+	  return 2;
+#else
+	  unsigned int v, l;
+
+	  uc = uc / 28;
+	  v = uc % 21;
+	  l = uc / 21;
+
+	  decomposition[0] = 0x1100 + l;
+	  decomposition[1] = 0x1161 + v;
 	  decomposition[2] = 0x11A7 + t;
 	  return 3;
+#endif
 	}
     }
   else if (uc < 0x110000)
--- a/lib/uninorm/decomposition.c	Sun Feb 22 13:15:52 2009 +0100
+++ b/lib/uninorm/decomposition.c	Sun Feb 22 15:05:45 2009 +0100
@@ -27,25 +27,46 @@
 {
   if (uc >= 0xAC00 && uc < 0xD7A4)
     {
-      /* Hangul syllable.  See Unicode standard, chapter 3,
-	 section "Hangul Syllable Decomposition".  */
-      unsigned int t, v, l;
+      /* Hangul syllable.  See Unicode standard, chapter 3, section
+         "Hangul Syllable Decomposition".  See also the clarification at
+	 <http://www.unicode.org/versions/Unicode5.1.0/>, section
+	 "Clarification of Hangul Jamo Handling".  */
+      unsigned int t;
 
       uc -= 0xAC00;
       t = uc % 28;
-      uc = uc / 28;
-      v = uc % 21;
-      l = uc / 21;
 
       *decomp_tag = UC_DECOMP_CANONICAL;
-      decomposition[0] = 0x1100 + l;
-      decomposition[1] = 0x1161 + v;
       if (t == 0)
-	return 2;
+	{
+	  unsigned int v, l;
+
+	  uc = uc / 28;
+	  v = uc % 21;
+	  l = uc / 21;
+
+	  decomposition[0] = 0x1100 + l;
+	  decomposition[1] = 0x1161 + v;
+	  return 2;
+	}
       else
 	{
+#if 1 /* Return the pairwise decomposition, not the full decomposition.  */
+	  decomposition[0] = 0xAC00 + uc - t; /* = 0xAC00 + (l * 21 + v) * 28; */
+	  decomposition[1] = 0x11A7 + t;
+	  return 2;
+#else
+	  unsigned int v, l;
+
+	  uc = uc / 28;
+	  v = uc % 21;
+	  l = uc / 21;
+
+	  decomposition[0] = 0x1100 + l;
+	  decomposition[1] = 0x1161 + v;
 	  decomposition[2] = 0x11A7 + t;
 	  return 3;
+#endif
 	}
     }
   else if (uc < 0x110000)
--- a/tests/uninorm/test-canonical-decomposition.c	Sun Feb 22 13:15:52 2009 +0100
+++ b/tests/uninorm/test-canonical-decomposition.c	Sun Feb 22 15:05:45 2009 +0100
@@ -133,10 +133,18 @@
 
   /* HANGUL SYLLABLE GEUL */
   ret = uc_canonical_decomposition (0xAE00, decomposed);
+  /* See the clarification at <http://www.unicode.org/versions/Unicode5.1.0/>,
+     section "Clarification of Hangul Jamo Handling".  */
+#if 1
+  ASSERT (ret == 2);
+  ASSERT (decomposed[0] == 0xADF8);
+  ASSERT (decomposed[1] == 0x11AF);
+#else
   ASSERT (ret == 3);
   ASSERT (decomposed[0] == 0x1100);
   ASSERT (decomposed[1] == 0x1173);
   ASSERT (decomposed[2] == 0x11AF);
+#endif
 
   /* HANGUL SYLLABLE GEU */
   ret = uc_canonical_decomposition (0xADF8, decomposed);
--- a/tests/uninorm/test-compat-decomposition.c	Sun Feb 22 13:15:52 2009 +0100
+++ b/tests/uninorm/test-compat-decomposition.c	Sun Feb 22 15:05:45 2009 +0100
@@ -175,10 +175,18 @@
 
   /* HANGUL SYLLABLE GEUL */
   ret = uc_compat_decomposition (0xAE00, decomposed);
+  /* See the clarification at <http://www.unicode.org/versions/Unicode5.1.0/>,
+     section "Clarification of Hangul Jamo Handling".  */
+#if 1
+  ASSERT (ret == 2);
+  ASSERT (decomposed[0] == 0xADF8);
+  ASSERT (decomposed[1] == 0x11AF);
+#else
   ASSERT (ret == 3);
   ASSERT (decomposed[0] == 0x1100);
   ASSERT (decomposed[1] == 0x1173);
   ASSERT (decomposed[2] == 0x11AF);
+#endif
 
   /* HANGUL SYLLABLE GEU */
   ret = uc_compat_decomposition (0xADF8, decomposed);
--- a/tests/uninorm/test-decomposition.c	Sun Feb 22 13:15:52 2009 +0100
+++ b/tests/uninorm/test-decomposition.c	Sun Feb 22 15:05:45 2009 +0100
@@ -194,11 +194,20 @@
 
   /* HANGUL SYLLABLE GEUL */
   ret = uc_decomposition (0xAE00, &tag, decomposed);
+  /* See the clarification at <http://www.unicode.org/versions/Unicode5.1.0/>,
+     section "Clarification of Hangul Jamo Handling".  */
+#if 1
+  ASSERT (ret == 2);
+  ASSERT (tag == UC_DECOMP_CANONICAL);
+  ASSERT (decomposed[0] == 0xADF8);
+  ASSERT (decomposed[1] == 0x11AF);
+#else
   ASSERT (ret == 3);
   ASSERT (tag == UC_DECOMP_CANONICAL);
   ASSERT (decomposed[0] == 0x1100);
   ASSERT (decomposed[1] == 0x1173);
   ASSERT (decomposed[2] == 0x11AF);
+#endif
 
   /* HANGUL SYLLABLE GEU */
   ret = uc_decomposition (0xADF8, &tag, decomposed);