changeset 11181:60928cee827b

New module 'uninorm/canonical-decomposition'.
author Bruno Haible <bruno@clisp.org>
date Sat, 21 Feb 2009 12:25:26 +0100
parents 756c9e858420
children 867218402200
files ChangeLog lib/uninorm/canonical-decomposition.c modules/uninorm/canonical-decomposition
diffstat 3 files changed, 115 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog	Sat Feb 21 12:20:52 2009 +0100
+++ b/ChangeLog	Sat Feb 21 12:25:26 2009 +0100
@@ -1,5 +1,9 @@
 2009-02-21  Bruno Haible  <bruno@clisp.org>
 
+	New module 'uninorm/canonical-decomposition'.
+	* lib/uninorm/canonical-decomposition.c: New file.
+	* modules/uninorm/canonical-decomposition: New file.
+
 	Tests for module 'uninorm/decomposition'.
 	* tests/uninorm/test-decomposition.c: New file.
 	* modules/uninorm/decomposition-tests: New file.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/uninorm/canonical-decomposition.c	Sat Feb 21 12:25:26 2009 +0100
@@ -0,0 +1,87 @@
+/* Canonical decomposition of Unicode characters.
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+/* Specification.  */
+#include "uninorm.h"
+
+#include <stdlib.h>
+
+#include "decomposition-table.h"
+/* Store UC's canonical decomposition in DECOMPOSITION; return its length, or -1 if none.  */
+int
+uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition)
+{
+  if (uc >= 0xAC00 && uc < 0xD7A4)
+    {
+      /* Hangul syllable: split arithmetically into L, V and optional T parts.
+	 See Unicode standard, chapter 3, "Hangul Syllable Decomposition".  */
+      unsigned int t, v, l;
+
+      uc -= 0xAC00;
+      t = uc % 28;
+      uc = uc / 28;
+      v = uc % 21;
+      l = uc / 21;
+
+      decomposition[0] = 0x1100 + l;
+      decomposition[1] = 0x1161 + v;
+      if (t == 0)
+	return 2;
+      else
+	{
+	  decomposition[2] = 0x11A7 + t;
+	  return 3;
+	}
+    }
+  else if (uc < 0x110000)
+    {
+      unsigned short entry = decomp_index (uc);
+      /* An entry of (unsigned short)(-1) denotes an absent entry.
+	 Otherwise, bit 15 of the entry tells whether the decomposition
+	 is a canonical one; only entries with bit 15 clear are used here.  */
+      if (entry < 0x8000)
+	{
+	  const unsigned char *p;
+	  unsigned int element;
+	  unsigned int length;
+
+	  p = &gl_uninorm_decomp_chars_table[3 * entry];
+	  element = (p[0] << 16) | (p[1] << 8) | p[2];
+	  /* The first element has 5 bits (18..22) for the decomposition type.  */
+	  if (((element >> 18) & 0x1f) != UC_DECOMP_CANONICAL)
+	    abort ();
+	  length = 1;
+	  for (;;)
+	    {
+	      /* Every element has an 18 bits wide Unicode code point.  */
+	      *decomposition = element & 0x3ffff;
+	      /* Bit 23 tells whether there are more elements.  */
+	      if ((element & (1 << 23)) == 0)
+		break;
+	      p += 3;
+	      element = (p[0] << 16) | (p[1] << 8) | p[2];
+	      decomposition++;
+	      length++;
+	    }
+	  return length;
+	}
+    }
+  return -1;
+}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/modules/uninorm/canonical-decomposition	Sat Feb 21 12:25:26 2009 +0100
@@ -0,0 +1,24 @@
+Description:
+Canonical decomposition of Unicode characters.
+
+Files:
+lib/uninorm/canonical-decomposition.c
+
+Depends-on:
+uninorm/base
+uninorm/decomposition-table
+
+configure.ac:
+
+Makefile.am:
+lib_SOURCES += uninorm/canonical-decomposition.c
+
+Include:
+"uninorm.h"
+
+License:
+LGPL
+
+Maintainer:
+Bruno Haible
+