changeset 30866:bf2332cf1d3e

New module 'uninorm/base'.
author Bruno Haible <bruno@clisp.org>
date Sat, 21 Feb 2009 12:01:44 +0100
parents 27584770253d
children 140ad9209c77
files ChangeLog lib/unictype.h lib/uninorm.h modules/uninorm/base
diffstat 4 files changed, 184 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog	Sat Feb 21 11:10:01 2009 +0100
+++ b/ChangeLog	Sat Feb 21 12:01:44 2009 +0100
@@ -1,3 +1,10 @@
+2009-02-21  Bruno Haible  <bruno@clisp.org>
+
+	New module 'uninorm/base'.
+	* lib/uninorm.h: New file.
+	* lib/unictype.h: Update comment.
+	* modules/uninorm/base: New file.
+
 2009-02-21  David Lutterkort  <lutter@redhat.com>
 
 	Tests for module 'safe-alloc'.
--- a/lib/unictype.h	Sat Feb 21 11:10:01 2009 +0100
+++ b/lib/unictype.h	Sat Feb 21 12:01:44 2009 +0100
@@ -1,5 +1,5 @@
 /* Unicode character classification and properties.
-   Copyright (C) 2002, 2005-2008 Free Software Foundation, Inc.
+   Copyright (C) 2002, 2005-2009 Free Software Foundation, Inc.
 
    This program is free software: you can redistribute it and/or modify it
    under the terms of the GNU Lesser General Public License as published
@@ -301,7 +301,7 @@
 /* ========================================================================= */
 
 /* Field 5 of Unicode Character Database: Character decomposition mapping.
-   See "unicomp.h".  */
+   See "uninorm.h".  */
 
 /* ========================================================================= */
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/lib/uninorm.h	Sat Feb 21 12:01:44 2009 +0100
@@ -0,0 +1,153 @@
+/* Normalization forms (composition and decomposition) of Unicode strings.
+   Copyright (C) 2001-2002, 2009 Free Software Foundation, Inc.
+   Written by Bruno Haible <bruno@clisp.org>, 2009.
+
+   This program is free software: you can redistribute it and/or modify it
+   under the terms of the GNU Lesser General Public License as published
+   by the Free Software Foundation; either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <http://www.gnu.org/licenses/>.  */
+
+#ifndef _UNINORM_H
+#define _UNINORM_H
+
+/* Get size_t.  */
+#include <stddef.h>
+
+#include "unitypes.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Conventions:
+
+   All functions prefixed with u8_ operate on UTF-8 encoded strings.
+   Their unit is an uint8_t (1 byte).
+
+   All functions prefixed with u16_ operate on UTF-16 encoded strings.
+   Their unit is an uint16_t (a 2-byte word).
+
+   All functions prefixed with u32_ operate on UCS-4 encoded strings.
+   Their unit is an uint32_t (a 4-byte word).
+
+   All argument pairs (s, n) denote a Unicode string s[0..n-1] with exactly
+   n units.
+
+   Functions returning a string result take a (resultbuf, lengthp) argument
+   pair.  If resultbuf is not NULL and the result fits into *lengthp units,
+   it is put in resultbuf, and resultbuf is returned.  Otherwise, a freshly
+   allocated string is returned.  In both cases, *lengthp is set to the
+   length (number of units) of the returned string.  In case of error,
+   NULL is returned and errno is set.  */
+
+
+enum
+{
+  UC_DECOMP_CANONICAL,/*            Canonical decomposition.                  */
+  UC_DECOMP_FONT,    /*   <font>    A font variant (e.g. a blackletter form). */
+  UC_DECOMP_NOBREAK, /* <noBreak>   A no-break version of a space or hyphen.  */
+  UC_DECOMP_INITIAL, /* <initial>   An initial presentation form (Arabic).    */
+  UC_DECOMP_MEDIAL,  /*  <medial>   A medial presentation form (Arabic).      */
+  UC_DECOMP_FINAL,   /*  <final>    A final presentation form (Arabic).       */
+  UC_DECOMP_ISOLATED,/* <isolated>  An isolated presentation form (Arabic).   */
+  UC_DECOMP_CIRCLE,  /*  <circle>   An encircled form.                        */
+  UC_DECOMP_SUPER,   /*  <super>    A superscript form.                       */
+  UC_DECOMP_SUB,     /*   <sub>     A subscript form.                         */
+  UC_DECOMP_VERTICAL,/* <vertical>  A vertical layout presentation form.      */
+  UC_DECOMP_WIDE,    /*   <wide>    A wide (or zenkaku) compatibility character. */
+  UC_DECOMP_NARROW,  /*  <narrow>   A narrow (or hankaku) compatibility character. */
+  UC_DECOMP_SMALL,   /*  <small>    A small variant form (CNS compatibility). */
+  UC_DECOMP_SQUARE,  /*  <square>   A CJK squared font variant.               */
+  UC_DECOMP_FRACTION,/* <fraction>  A vulgar fraction form.                   */
+  UC_DECOMP_COMPAT   /*  <compat>   Otherwise unspecified compatibility character. */
+};
+
+/* Maximum size of decomposition of a single Unicode character.  */
+#define UC_DECOMPOSITION_MAX_LENGTH 32
+
+/* Return the character decomposition mapping of a Unicode character.
+   DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
+   ucs4_t elements.
+   When a decomposition exists, DECOMPOSITION[0..N-1] and *DECOMP_TAG are
+   filled and N is returned.  Otherwise -1 is returned.  */
+extern int
+       uc_decomposition (ucs4_t uc, int *decomp_tag, ucs4_t *decomposition);
+
+/* Return the canonical character decomposition mapping of a Unicode character.
+   DECOMPOSITION must point to an array of at least UC_DECOMPOSITION_MAX_LENGTH
+   ucs4_t elements.
+   When a decomposition exists, DECOMPOSITION[0..N-1] is filled and N is
+   returned.  Otherwise -1 is returned.  */
+extern int
+       uc_canonical_decomposition (ucs4_t uc, ucs4_t *decomposition);
+
+
+/* Attempt to combine the Unicode characters uc1, uc2.
+   uc1 is known to have canonical combining class 0.
+   Return the combination of uc1 and uc2, if it exists.
+   Return 0 otherwise.
+   Not all decompositions can be recombined using this function.  See the
+   Unicode file CompositionExclusions.txt for details.  */
+extern ucs4_t
+       uc_composition (ucs4_t uc1, ucs4_t uc2);
+
+
+/* An object of type uninorm_t denotes a Unicode normalization form.  */
+struct unicode_normalization_form;
+typedef const struct unicode_normalization_form *uninorm_t;
+
+/* UNINORM_NFD: Normalization form D: canonical decomposition.  */
+extern const struct unicode_normalization_form uninorm_nfd;
+#define UNINORM_NFD (&uninorm_nfd)
+
+/* UNINORM_NFC: Normalization form C: canonical decomposition, then
+   canonical composition.  */
+extern const struct unicode_normalization_form uninorm_nfc;
+#define UNINORM_NFC (&uninorm_nfc)
+
+/* UNINORM_NFKD: Normalization form KD: compatibility decomposition.  */
+extern const struct unicode_normalization_form uninorm_nfkd;
+#define UNINORM_NFKD (&uninorm_nfkd)
+
+/* UNINORM_NFKC: Normalization form KC: compatibility decomposition, then
+   canonical composition.  */
+extern const struct unicode_normalization_form uninorm_nfkc;
+#define UNINORM_NFKC (&uninorm_nfkc)
+
+/* Test whether a normalization form does compatibility decomposition.  */
+#define uninorm_is_compat_decomposing(nf) \
+  ((* (const unsigned int *) (nf) >> 0) & 1)
+
+/* Test whether a normalization form includes canonical composition.  */
+#define uninorm_is_composing(nf) \
+  ((* (const unsigned int *) (nf) >> 1) & 1)
+
+
+/* Return the specified normalization form of a string.  */
+extern uint8_t *
+       u8_normalize (uninorm_t nf, const uint8_t *s, size_t n,
+		     uint8_t *resultbuf, size_t *lengthp);
+extern uint16_t *
+       u16_normalize (uninorm_t nf, const uint16_t *s, size_t n,
+		      uint16_t *resultbuf, size_t *lengthp);
+extern uint32_t *
+       u32_normalize (uninorm_t nf, const uint32_t *s, size_t n,
+		      uint32_t *resultbuf, size_t *lengthp);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif /* _UNINORM_H */
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/modules/uninorm/base	Sat Feb 21 12:01:44 2009 +0100
@@ -0,0 +1,22 @@
+Description:
+Base layer for normalization forms of Unicode strings.
+
+Files:
+lib/uninorm.h
+
+Depends-on:
+unitypes
+
+configure.ac:
+
+Makefile.am:
+
+Include:
+"uninorm.h"
+
+License:
+LGPL
+
+Maintainer:
+Bruno Haible
+