annotate lib/unictype/categ_of.c @ 40210:44073ad4207f

unictype/numeric: Fix undefined behaviour. Reported by Jeffrey Walton <noloader@gmail.com>. * lib/unictype/numeric.c (uc_numeric_value): Avoid undefined behaviour on shift overflow, caught by "gcc -fsanitize=undefined". * lib/unictype/bidi_of.c (uc_bidi_class): Add cast, for clarity. * lib/unictype/categ_of.c (lookup_withtable): Likewise. * lib/unictype/joininggroup_of.c (uc_joining_group): Likewise.
author Bruno Haible <bruno@clisp.org>
date Fri, 08 Mar 2019 19:17:37 +0100
parents b06060465f09
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
9471
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
1 /* Categories of Unicode characters.
40057
b06060465f09 maint: Run 'make update-copyright'
Paul Eggert <eggert@cs.ucla.edu>
parents: 19484
diff changeset
2 Copyright (C) 2002, 2006-2007, 2009-2019 Free Software Foundation, Inc.
9471
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
3 Written by Bruno Haible <bruno@clisp.org>, 2002.
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
4
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
5 This program is free software: you can redistribute it and/or modify it
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
6 under the terms of the GNU Lesser General Public License as published
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
7 by the Free Software Foundation; either version 3 of the License, or
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
8 (at your option) any later version.
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
9
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
10 This program is distributed in the hope that it will be useful,
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
13 Lesser General Public License for more details.
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
14
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
15 You should have received a copy of the GNU Lesser General Public License
19190
9759915b2aca all: prefer https: URLs
Paul Eggert <eggert@cs.ucla.edu>
parents: 18626
diff changeset
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
9471
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
17
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
18 #include <config.h>
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
19
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
20 /* Specification. */
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
21 #include "unictype.h"
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
22
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
23 /* Define u_category table. */
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
24 #include "categ_of.h"
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
25
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
26 static inline int
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
27 lookup_withtable (ucs4_t uc)
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
28 {
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
29 unsigned int index1 = uc >> category_header_0;
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
30 if (index1 < category_header_1)
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
31 {
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
32 int lookup1 = u_category.level1[index1];
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
33 if (lookup1 >= 0)
12421
e8d2c6fc33ad Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents: 9471
diff changeset
34 {
e8d2c6fc33ad Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents: 9471
diff changeset
35 unsigned int index2 = (uc >> category_header_2) & category_header_3;
e8d2c6fc33ad Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents: 9471
diff changeset
36 int lookup2 = u_category.level2[lookup1 + index2];
e8d2c6fc33ad Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents: 9471
diff changeset
37 if (lookup2 >= 0)
e8d2c6fc33ad Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents: 9471
diff changeset
38 {
e8d2c6fc33ad Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents: 9471
diff changeset
39 unsigned int index3 = ((uc & category_header_4) + lookup2) * 5;
e8d2c6fc33ad Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents: 9471
diff changeset
40 /* level3 contains 5-bit values, packed into 16-bit words. */
e8d2c6fc33ad Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents: 9471
diff changeset
41 unsigned int lookup3 =
40210
44073ad4207f unictype/numeric: Fix undefined behaviour.
Bruno Haible <bruno@clisp.org>
parents: 40057
diff changeset
42 (((unsigned int) u_category.level3[index3>>4]
17880
18371cbd9692 unictype: avoid undefined left-shift behavior
Daiki Ueno <ueno@gnu.org>
parents: 17848
diff changeset
43 | ((unsigned int) u_category.level3[(index3>>4)+1] << 16))
12421
e8d2c6fc33ad Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents: 9471
diff changeset
44 >> (index3 % 16))
e8d2c6fc33ad Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents: 9471
diff changeset
45 & 0x1f;
9471
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
46
12421
e8d2c6fc33ad Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents: 9471
diff changeset
47 return lookup3;
e8d2c6fc33ad Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents: 9471
diff changeset
48 }
e8d2c6fc33ad Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents: 9471
diff changeset
49 }
9471
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
50 return 29; /* = log2(UC_CATEGORY_MASK_Cn) */
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
51 }
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
52 return -1;
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
53 }
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
54
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
55 bool
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
56 uc_is_general_category_withtable (ucs4_t uc, uint32_t bitmask)
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
57 {
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
58 int bit = lookup_withtable (uc);
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
59
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
60 if (bit >= 0)
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
61 return ((bitmask >> bit) & 1);
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
62 else
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
63 return false;
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
64 }
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
65
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
66 uc_general_category_t
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
67 uc_general_category (ucs4_t uc)
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
68 {
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
69 int bit = lookup_withtable (uc);
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
70 uc_general_category_t result;
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
71
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
72 if (bit >= 0)
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
73 {
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
74 result.bitmask = 1 << bit;
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
75 result.generic = 1;
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
76 result.lookup.lookup_fn = &uc_is_general_category_withtable;
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
77 return result;
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
78 }
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
79 else
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
80 return _UC_CATEGORY_NONE;
6dc73c76eced Unicode character classification functions.
Bruno Haible <bruno@clisp.org>
parents:
diff changeset
81 }