annotate lib/unigbrk/ulc-grapheme-breaks.c @ 40057:b06060465f09

maint: Run 'make update-copyright'
author Paul Eggert <eggert@cs.ucla.edu>
date Tue, 01 Jan 2019 00:25:11 +0100
parents 10eb9086bea0
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
14076
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
1 /* Grapheme cluster breaks function.
40057
b06060465f09 maint: Run 'make update-copyright'
Paul Eggert <eggert@cs.ucla.edu>
parents: 19484
diff changeset
2 Copyright (C) 2001-2003, 2006-2019 Free Software Foundation, Inc.
14076
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
3 Written by Ben Pfaff <blp@cs.stanford.edu>, 2010,
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
4 based on code written by Bruno Haible <bruno@clisp.org>, 2009.
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
5
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
6 This program is free software: you can redistribute it and/or modify it
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
7 under the terms of the GNU Lesser General Public License as published
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
8 by the Free Software Foundation; either version 3 of the License, or
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
9 (at your option) any later version.
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
10
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
11 This program is distributed in the hope that it will be useful,
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
14 Lesser General Public License for more details.
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
15
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
16 You should have received a copy of the GNU Lesser General Public License
19190
9759915b2aca all: prefer https: URLs
Paul Eggert <eggert@cs.ucla.edu>
parents: 18626
diff changeset
17 along with this program. If not, see <https://www.gnu.org/licenses/>. */
14076
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
18
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
19 #include <config.h>
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
20
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
21 /* Specification. */
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
22 #include "unigbrk.h"
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
23
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
24 #include <stdlib.h>
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
25 #include <string.h>
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
26
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
27 #include "c-ctype.h"
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
28 #include "c-strcaseeq.h"
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
29 #include "localcharset.h"
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
30 #include "uniconv.h"
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
31
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
/* Return 1 if ENCODING compares equal to "UTF-8" under the STRCASEEQ
   comparison, and 0 otherwise.  */
static int
is_utf8_encoding (const char *encoding)
{
  return (STRCASEEQ (encoding, "UTF-8", 'U', 'T', 'F', '-', '8', 0, 0, 0, 0)
          ? 1
          : 0);
}
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
39
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
#if C_CTYPE_ASCII
/* Fallback used when the string cannot be converted to Unicode.

   Treat every printable or whitespace ASCII character as the start of
   a new grapheme, which is often true, except that the LF of a CR-LF
   pair continues the grapheme begun by the CR.  Any other byte is
   marked as not starting a grapheme.

   S is the input of N bytes (N must be at least 1, since P[0] is
   written unconditionally); P receives N flags, 1 for "a grapheme
   starts here" and 0 otherwise.  */
static void
ascii_grapheme_breaks (const char *s, size_t n, char *p)
{
  size_t pos;

  /* The first byte always begins a grapheme.  */
  p[0] = 1;

  for (pos = 1; pos < n; pos++)
    {
      char cur = s[pos];

      if (!(c_isprint (cur) || c_isspace (cur)))
        /* Neither printable nor whitespace ASCII: no break here.  */
        p[pos] = 0;
      else if (cur == '\n' && s[pos - 1] == '\r')
        /* Second half of CR-LF: stays attached to the CR.  */
        p[pos] = 0;
      else
        p[pos] = 1;
    }
}
#endif
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
56
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
57 /* Grapheme boundaries in a string in an arbitrary encoding.
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
58
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
59 We convert the input string to Unicode.
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
60
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
61 The standardized Unicode encodings are UTF-8, UCS-2, UCS-4, UTF-16,
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
62 UTF-16BE, UTF-16LE, UTF-7. UCS-2 supports only characters up to
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
63 \U0000FFFF. UTF-16 and variants support only characters up to
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
64 \U0010FFFF. UTF-7 is way too complex and not supported by glibc-2.1.
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
65 UCS-4 specification leaves doubts about endianness and byte order mark.
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
66 glibc currently interprets it as big endian without byte order mark,
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
67 but this is not backed by an RFC. So we use UTF-8. It supports
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
68 characters up to \U7FFFFFFF and is unambiguously defined. */
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
69
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
70 void
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
71 ulc_grapheme_breaks (const char *s, size_t n, char *p)
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
72 {
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
73 if (n > 0)
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
74 {
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
75 const char *encoding = locale_charset ();
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
76
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
77 if (is_utf8_encoding (encoding))
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
78 u8_grapheme_breaks ((const uint8_t *) s, n, p);
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
79 else
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
80 {
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
81 /* Convert the string to UTF-8 and build a translation table
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
82 from offsets into s to offsets into the translated string. */
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
83 size_t *offsets = (size_t *) malloc (n * sizeof (size_t));
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
84
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
85 if (offsets != NULL)
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
86 {
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
87 uint8_t *t;
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
88 size_t m;
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
89
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
90 t = u8_conv_from_encoding (encoding, iconveh_question_mark,
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
91 s, n, offsets, NULL, &m);
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
92 if (t != NULL)
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
93 {
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
94 char *q = (char *) (m > 0 ? malloc (m) : NULL);
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
95
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
96 if (m == 0 || q != NULL)
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
97 {
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
98 size_t i;
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
99
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
100 /* Determine the grapheme breaks of the UTF-8 string. */
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
101 u8_grapheme_breaks (t, m, q);
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
102
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
103 /* Translate the result back to the original string. */
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
104 memset (p, 0, n);
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
105 for (i = 0; i < n; i++)
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
106 if (offsets[i] != (size_t)(-1))
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
107 p[i] = q[offsets[i]];
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
108
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
109 free (q);
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
110 free (t);
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
111 free (offsets);
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
112 return;
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
113 }
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
114 free (t);
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
115 }
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
116 free (offsets);
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
117 }
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
118
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
119 /* Impossible to convert. */
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
120 #if C_CTYPE_ASCII
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
121 /* Fall back to ASCII as best we can. */
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
122 ascii_grapheme_breaks (s, n, p);
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
123 #else
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
124 /* We cannot make any assumptions. */
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
125 p[0] = 1;
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
126 memset (p + 1, 0, n - 1);
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
127 #endif
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
128 }
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
129 }
bf75753bb6d8 unigbrk: New modules for grapheme clusters.
Ben Pfaff <blp@cs.stanford.edu>
parents:
diff changeset
130 }