changeset 30994:aea422199f47

Generate more tables for case conversion and case folding.
author Bruno Haible <bruno@clisp.org>
date Sun, 08 Mar 2009 14:08:52 +0100
parents b0019d3a5862
children e675fa707aa6
files ChangeLog lib/gen-uni-tables.c
diffstat 2 files changed, 791 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog	Sun Mar 08 14:00:35 2009 +0100
+++ b/ChangeLog	Sun Mar 08 14:08:52 2009 +0100
@@ -1,5 +1,23 @@
 2009-03-08  Bruno Haible  <bruno@clisp.org>
 
+	Generate more tables for case conversion and case folding.
+	* lib/gen-uni-tables.c (SCC_*): New enum items.
+	(struct special_casing_rule): New type.
+	(casing_rules, num_casing_rules, allocated_casing_rules): New
+	variables.
+	(add_casing_rule, fill_casing_rules): New functions.
+	(struct casefold_rule): New type.
+	(casefolding_rules, num_casefolding_rules,
+	allocated_casefolding_rules): New variables.
+	(fill_casefolding_rules): New function.
+	(unicode_casefold): New variable.
+	(to_casefold, redistribute_casefolding_rules, compare_casing_rules,
+	sort_casing_rules, output_casing_rules): New functions.
+	(main): Accept two more arguments: SpecialCasing.txt and
+	CaseFolding.txt. Invoke fill_casing_rules, fill_casefolding_rules,
+	redistribute_casefolding_rules, sort_casing_rules, output_casing_rules.
+	Output mapping for casefolding.
+
 	* lib/unicase.h: Include stdbool.h, uninorm.h.
 	(u8_toupper, u16_toupper, u32_toupper, u8_tolower, u16_tolower,
 	u32_tolower, u8_totitle, u16_totitle, u32_totitle): Add nf argument.
--- a/lib/gen-uni-tables.c	Sun Mar 08 14:00:35 2009 +0100
+++ b/lib/gen-uni-tables.c	Sun Mar 08 14:08:52 2009 +0100
@@ -28,6 +28,8 @@
                       /usr/local/share/Unidata/LineBreak.txt \
                       /usr/local/share/Unidata/WordBreakProperty.txt \
                       /usr/local/share/Unidata/CompositionExclusions.txt \
+                      /usr/local/share/Unidata/SpecialCasing.txt \
+                      /usr/local/share/Unidata/CaseFolding.txt \
                       5.1.0
  */
 
@@ -7453,6 +7455,761 @@
 
 /* ========================================================================= */
 
+/* The context condition of a special casing rule.
+   A negated context ("Not_..." in the data file) is stored as x -> -x.  */
+enum
+{
+  SCC_ALWAYS             = 0, /* no context condition given */
+  SCC_FINAL_SIGMA, /* "Final_Sigma" */
+  SCC_AFTER_SOFT_DOTTED, /* "After_Soft_Dotted" */
+  SCC_MORE_ABOVE, /* "More_Above" */
+  SCC_BEFORE_DOT, /* "Before_Dot" */
+  SCC_AFTER_I /* "After_I" */
+};
+
+/* A special casing rule: the mappings of one character under an optional language and context condition.  */
+struct special_casing_rule
+{
+  unsigned int code; /* the character the rule applies to */
+  unsigned int lower_mapping[3]; /* up to 3 characters; unused slots are 0 */
+  unsigned int title_mapping[3]; /* likewise */
+  unsigned int upper_mapping[3]; /* likewise */
+  unsigned int casefold_mapping[3]; /* likewise; filled in by redistribute_casefolding_rules */
+  const char *language; /* 2-character language code, or NULL if there is no language restriction */
+  int context; /* one of the SCC_* values, negative if the context is negated */
+};
+
+/* The special casing rules, as a growable array of pointers to rules.  */
+struct special_casing_rule **casing_rules;
+unsigned int num_casing_rules; /* number of entries in use */
+unsigned int allocated_casing_rules; /* number of entries allocated */
+
+static void
+add_casing_rule (struct special_casing_rule *new_rule) /* Appends NEW_RULE to casing_rules.  */
+{
+  if (num_casing_rules == allocated_casing_rules)
+    {
+      allocated_casing_rules = 2 * allocated_casing_rules; /* array is full: double its capacity ... */
+      if (allocated_casing_rules < 16) /* ... but allocate at least 16 entries */
+	allocated_casing_rules = 16;
+      casing_rules =
+	(struct special_casing_rule **)
+	realloc (casing_rules, allocated_casing_rules * sizeof (struct special_casing_rule *)); /* NOTE(review): a NULL result is not checked */
+    }
+  casing_rules[num_casing_rules++] = new_rule; /* the pointer itself is stored, not a copy of the rule */
+}
+
+/* Stores in casing_rules the special casing rules found in
+   specialcasing_filename.  Exits with status 1 on an open, read or parse error.  */
+static void
+fill_casing_rules (const char *specialcasing_filename)
+{
+  FILE *stream;
+
+  stream = fopen (specialcasing_filename, "r");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "error during fopen of '%s'\n", specialcasing_filename);
+      exit (1);
+    }
+
+  casing_rules = NULL;
+  num_casing_rules = 0;
+  allocated_casing_rules = 0;
+
+  for (;;)
+    {
+      char buf[200+1];
+      char *scanptr;
+      char *endptr;
+      int i;
+
+      unsigned int code;
+      unsigned int lower_mapping[3];
+      unsigned int title_mapping[3];
+      unsigned int upper_mapping[3];
+      char *language;
+      int context;
+
+      if (fscanf (stream, "%200[^\n]\n", buf) < 1) /* read one line, at most 200 characters of it */
+	break;
+
+      if (buf[0] == '\0' || buf[0] == '#') /* skip lines that start with '#' */
+	continue;
+
+      /* Scan code: a hexadecimal number followed by ';'.  */
+      scanptr = buf;
+      code = strtoul (scanptr, &endptr, 16);
+      if (endptr == scanptr)
+	{
+	  fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+	  exit (1);
+	}
+      scanptr = endptr;
+      if (*scanptr != ';')
+	{
+	  fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+	  exit (1);
+	}
+      scanptr++;
+
+      /* Scan lower mapping: up to 3 hexadecimal numbers, followed by ';'.  Unused slots stay 0.  */
+      for (i = 0; i < 3; i++)
+	lower_mapping[i] = 0;
+      for (i = 0; i < 3; i++)
+	{
+	  while (*scanptr == ' ')
+	    scanptr++;
+	  if (*scanptr == ';')
+	    break;
+	  lower_mapping[i] = strtoul (scanptr, &endptr, 16);
+	  if (endptr == scanptr)
+	    {
+	      fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+	      exit (1);
+	    }
+	  scanptr = endptr;
+	}
+      if (*scanptr != ';')
+	{
+	  fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+	  exit (1);
+	}
+      scanptr++;
+
+      /* Scan title mapping, in the same format as the lower mapping.  */
+      for (i = 0; i < 3; i++)
+	title_mapping[i] = 0;
+      for (i = 0; i < 3; i++)
+	{
+	  while (*scanptr == ' ')
+	    scanptr++;
+	  if (*scanptr == ';')
+	    break;
+	  title_mapping[i] = strtoul (scanptr, &endptr, 16);
+	  if (endptr == scanptr)
+	    {
+	      fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+	      exit (1);
+	    }
+	  scanptr = endptr;
+	}
+      if (*scanptr != ';')
+	{
+	  fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+	  exit (1);
+	}
+      scanptr++;
+
+      /* Scan upper mapping, in the same format as the lower mapping.  */
+      for (i = 0; i < 3; i++)
+	upper_mapping[i] = 0;
+      for (i = 0; i < 3; i++)
+	{
+	  while (*scanptr == ' ')
+	    scanptr++;
+	  if (*scanptr == ';')
+	    break;
+	  upper_mapping[i] = strtoul (scanptr, &endptr, 16);
+	  if (endptr == scanptr)
+	    {
+	      fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+	      exit (1);
+	    }
+	  scanptr = endptr;
+	}
+      if (*scanptr != ';')
+	{
+	  fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+	  exit (1);
+	}
+      scanptr++;
+
+      /* Scan the optional language (a 2-character word) and the optional context (a name, possibly prefixed with "Not_").  */
+      language = NULL;
+      context = SCC_ALWAYS;
+      while (*scanptr == ' ')
+	scanptr++;
+      if (*scanptr != '\0' && *scanptr != '#')
+	{
+	  const char *word_begin = scanptr;
+	  const char *word_end;
+
+	  while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
+	    scanptr++;
+	  word_end = scanptr;
+
+	  while (*scanptr == ' ')
+	    scanptr++;
+
+	  if (word_end - word_begin == 2) /* any word of exactly 2 characters is taken as the language */
+	    {
+	      language = (char *) malloc ((word_end - word_begin) + 1);
+	      memcpy (language, word_begin, 2);
+	      language[word_end - word_begin] = '\0';
+	      word_begin = word_end = NULL; /* the word is consumed; no context word so far */
+
+	      if (*scanptr != '\0' && *scanptr != '#' &&  *scanptr != ';')
+		{
+		  word_begin = scanptr;
+		  while (*scanptr != '\0' && *scanptr != '#' && *scanptr != ';' && *scanptr != ' ')
+		    scanptr++;
+		  word_end = scanptr;
+		}
+	    }
+
+	  if (word_end > word_begin) /* a context word is present */
+	    {
+	      bool negate = false;
+
+	      if (word_end - word_begin >= 4 && memcmp (word_begin, "Not_", 4) == 0)
+		{
+		  word_begin += 4;
+		  negate = true;
+		}
+	      if (word_end - word_begin == 11 && memcmp (word_begin, "Final_Sigma", 11) == 0)
+		context = SCC_FINAL_SIGMA;
+	      else if (word_end - word_begin == 17 && memcmp (word_begin, "After_Soft_Dotted", 17) == 0)
+		context = SCC_AFTER_SOFT_DOTTED;
+	      else if (word_end - word_begin == 10 && memcmp (word_begin, "More_Above", 10) == 0)
+		context = SCC_MORE_ABOVE;
+	      else if (word_end - word_begin == 10 && memcmp (word_begin, "Before_Dot", 10) == 0)
+		context = SCC_BEFORE_DOT;
+	      else if (word_end - word_begin == 7 && memcmp (word_begin, "After_I", 7) == 0)
+		context = SCC_AFTER_I;
+	      else
+		{
+		  fprintf (stderr, "unknown context type in '%s'\n", specialcasing_filename);
+		  exit (1);
+		}
+	      if (negate) /* a negated context is stored as the negative value */
+		context = - context;
+	    }
+
+	  if (*scanptr != '\0' && *scanptr != '#' &&  *scanptr != ';')
+	    {
+	      fprintf (stderr, "parse error in '%s'\n", specialcasing_filename);
+	      exit (1);
+	    }
+	}
+
+      /* Store the rule.  Its casefold_mapping is not set here; redistribute_casefolding_rules fills it in.  */
+      {
+	struct special_casing_rule *new_rule =
+	  (struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
+	new_rule->code = code;
+	new_rule->language = language;
+	new_rule->context = context;
+	memcpy (new_rule->lower_mapping, lower_mapping, sizeof (new_rule->lower_mapping));
+	memcpy (new_rule->title_mapping, title_mapping, sizeof (new_rule->title_mapping));
+	memcpy (new_rule->upper_mapping, upper_mapping, sizeof (new_rule->upper_mapping));
+
+	add_casing_rule (new_rule);
+      }
+    }
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error reading from '%s'\n", specialcasing_filename);
+      exit (1);
+    }
+}
+
+/* A casefolding rule, as read by fill_casefolding_rules.  */
+struct casefold_rule
+{
+  unsigned int code; /* the character the rule applies to */
+  unsigned int mapping[3]; /* its folding: up to 3 characters; unused slots are 0 */
+  const char *language; /* "tr" or "az" for rules of type 'T', otherwise NULL */
+};
+
+/* The casefolding rules, as a growable array of pointers to rules.  */
+struct casefold_rule **casefolding_rules;
+unsigned int num_casefolding_rules; /* number of entries in use */
+unsigned int allocated_casefolding_rules; /* number of entries allocated */
+
+/* Stores in casefolding_rules the case folding rules found in
+   casefolding_filename.  Exits with status 1 on an open, read or parse error.  */
+static void
+fill_casefolding_rules (const char *casefolding_filename)
+{
+  FILE *stream;
+
+  stream = fopen (casefolding_filename, "r");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "error during fopen of '%s'\n", casefolding_filename);
+      exit (1);
+    }
+
+  casefolding_rules = NULL;
+  num_casefolding_rules = 0;
+  allocated_casefolding_rules = 0;
+
+  for (;;)
+    {
+      char buf[200+1];
+      char *scanptr;
+      char *endptr;
+      int i;
+
+      unsigned int code;
+      char type;
+      unsigned int mapping[3];
+
+      if (fscanf (stream, "%200[^\n]\n", buf) < 1) /* read one line, at most 200 characters of it */
+	break;
+
+      if (buf[0] == '\0' || buf[0] == '#') /* skip lines that start with '#' */
+	continue;
+
+      /* Scan code: a hexadecimal number followed by ';'.  */
+      scanptr = buf;
+      code = strtoul (scanptr, &endptr, 16);
+      if (endptr == scanptr)
+	{
+	  fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
+	  exit (1);
+	}
+      scanptr = endptr;
+      if (*scanptr != ';')
+	{
+	  fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
+	  exit (1);
+	}
+      scanptr++;
+
+      /* Scan type: one of the letters C, F, S, T, followed by ';'.  */
+      while (*scanptr == ' ')
+	scanptr++;
+
+      switch (*scanptr)
+	{
+	case 'C': case 'F': case 'S': case 'T':
+	  type = *scanptr;
+	  break;
+	default:
+	  fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
+	  exit (1);
+	}
+      scanptr++;
+      if (*scanptr != ';')
+	{
+	  fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
+	  exit (1);
+	}
+      scanptr++;
+
+      /* Scan casefold mapping: up to 3 hexadecimal numbers, followed by ';'.  Unused slots stay 0.  */
+      for (i = 0; i < 3; i++)
+	mapping[i] = 0;
+      for (i = 0; i < 3; i++)
+	{
+	  while (*scanptr == ' ')
+	    scanptr++;
+	  if (*scanptr == ';')
+	    break;
+	  mapping[i] = strtoul (scanptr, &endptr, 16);
+	  if (endptr == scanptr)
+	    {
+	      fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
+	      exit (1);
+	    }
+	  scanptr = endptr;
+	}
+      if (*scanptr != ';')
+	{
+	  fprintf (stderr, "parse error in '%s'\n", casefolding_filename);
+	  exit (1);
+	}
+      scanptr++;
+
+      /* Ignore rules of type 'S'; we use the rules of type 'F' instead.  */
+      if (type != 'S')
+	{
+	  const char * const *languages;
+	  unsigned int languages_count;
+
+	  /* Type 'T' indicates that the rule is applicable to Turkish
+	     languages only.  */
+	  if (type == 'T')
+	    {
+	      static const char * const turkish_languages[] = { "tr", "az" };
+	      languages = turkish_languages;
+	      languages_count = 2;
+	    }
+	  else
+	    {
+	      static const char * const all_languages[] = { NULL }; /* NULL: no language restriction */
+	      languages = all_languages;
+	      languages_count = 1;
+	    }
+
+	  for (i = 0; i < languages_count; i++)
+	    {
+	      /* Store a new rule, one per language in the list.  */
+	      struct casefold_rule *new_rule =
+		(struct casefold_rule *) malloc (sizeof (struct casefold_rule));
+	      new_rule->code = code;
+	      memcpy (new_rule->mapping, mapping, sizeof (new_rule->mapping));
+	      new_rule->language = languages[i];
+
+	      if (num_casefolding_rules == allocated_casefolding_rules) /* array is full: grow it as add_casing_rule does */
+		{
+		  allocated_casefolding_rules = 2 * allocated_casefolding_rules;
+		  if (allocated_casefolding_rules < 16)
+		    allocated_casefolding_rules = 16;
+		  casefolding_rules =
+		    (struct casefold_rule **)
+		    realloc (casefolding_rules,
+			     allocated_casefolding_rules * sizeof (struct casefold_rule *));
+		}
+	      casefolding_rules[num_casefolding_rules++] = new_rule;
+	    }
+	}
+    }
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error reading from '%s'\n", casefolding_filename);
+      exit (1);
+    }
+}
+
+/* Casefold mapping, when it maps to a single character; indexed by code point and filled in by redistribute_casefolding_rules.  */
+unsigned int unicode_casefold[0x110000];
+
+static unsigned int
+to_casefold (unsigned int ch) /* Returns the entry of unicode_casefold for CH.  */
+{
+  return unicode_casefold[ch]; /* no range check: CH must be < 0x110000 */
+}
+
+/* Redistribute the casefolding_rules:
+   - Rules that map to a single character, language independently, are stored
+     in unicode_casefold.
+   - Other rules are merged into casing_rules, possibly appending new rules.  */
+static void
+redistribute_casefolding_rules (void)
+{
+  unsigned int ch, i, j;
+
+  /* Fill unicode_casefold[]: identity by default, overridden by the single-character, language-independent rules.  */
+  for (ch = 0; ch < 0x110000; ch++)
+    unicode_casefold[ch] = ch;
+  for (i = 0; i < num_casefolding_rules; i++)
+    {
+      struct casefold_rule *cfrule = casefolding_rules[i];
+
+      if (cfrule->language == NULL && cfrule->mapping[1] == 0)
+	{
+	  ch = cfrule->code;
+	  if (!(ch < 0x110000))
+	    abort ();
+	  unicode_casefold[ch] = cfrule->mapping[0];
+	}
+    }
+
+  /* Extend the special casing rules by filling in their casefold_mapping[]
+     field, initially with the single-character mapping of their code.  */
+  for (j = 0; j < num_casing_rules; j++)
+    {
+      struct special_casing_rule *rule = casing_rules[j];
+      unsigned int k;
+
+      rule->casefold_mapping[0] = to_casefold (rule->code);
+      for (k = 1; k < 3; k++)
+	rule->casefold_mapping[k] = 0;
+    }
+
+  /* Now merge the other casefolding rules into casing_rules.  Rules appended here are seen by the searches of later iterations.  */
+  for (i = 0; i < num_casefolding_rules; i++)
+    {
+      struct casefold_rule *cfrule = casefolding_rules[i];
+
+      if (!(cfrule->language == NULL && cfrule->mapping[1] == 0))
+	{
+	  /* Find a rule that applies to the same code, same language, and it
+	     has context SCC_ALWAYS.  At the same time, update all rules that
+	     have the same code and same or more specific language.  */
+	  struct special_casing_rule *found_rule = NULL;
+
+	  for (j = 0; j < num_casing_rules; j++)
+	    {
+	      struct special_casing_rule *rule = casing_rules[j];
+
+	      if (rule->code == cfrule->code
+		  && (cfrule->language == NULL
+		      || (rule->language != NULL
+			  && strcmp (rule->language, cfrule->language) == 0)))
+		{
+		  memcpy (rule->casefold_mapping, cfrule->mapping,
+			  sizeof (rule->casefold_mapping));
+
+		  if ((cfrule->language == NULL
+		       ? rule->language == NULL
+		       : rule->language != NULL
+			 && strcmp (rule->language, cfrule->language) == 0)
+		      && rule->context == SCC_ALWAYS)
+		    {
+		      /* Found it.  */
+		      found_rule = rule;
+		    }
+		}
+	    }
+
+	  if (found_rule == NULL)
+	    {
+	      /* No such rule exists: create a new one, with context SCC_ALWAYS.  */
+	      struct special_casing_rule *new_rule =
+		(struct special_casing_rule *) malloc (sizeof (struct special_casing_rule));
+
+	      /* Try to find a rule that applies to the same code, no language
+		 restriction, and with context SCC_ALWAYS; the new rule inherits its lower/title/upper mappings.  */
+	      for (j = 0; j < num_casing_rules; j++)
+		{
+		  struct special_casing_rule *rule = casing_rules[j];
+
+		  if (rule->code == cfrule->code
+		      && rule->context == SCC_ALWAYS
+		      && rule->language == NULL)
+		    {
+		      /* Found it.  */
+		      found_rule = rule;
+		      break;
+		    }
+		}
+
+	      new_rule->code = cfrule->code;
+	      new_rule->language = cfrule->language;
+	      new_rule->context = SCC_ALWAYS;
+	      if (found_rule != NULL)
+		{
+		  memcpy (new_rule->lower_mapping, found_rule->lower_mapping,
+			  sizeof (new_rule->lower_mapping));
+		  memcpy (new_rule->title_mapping, found_rule->title_mapping,
+			  sizeof (new_rule->title_mapping));
+		  memcpy (new_rule->upper_mapping, found_rule->upper_mapping,
+			  sizeof (new_rule->upper_mapping));
+		}
+	      else /* nothing to inherit from: use to_lower, to_title, to_upper (defined outside this hunk; presumably the single-character mappings) */
+		{
+		  unsigned int k;
+
+		  new_rule->lower_mapping[0] = to_lower (cfrule->code);
+		  for (k = 1; k < 3; k++)
+		    new_rule->lower_mapping[k] = 0;
+		  new_rule->title_mapping[0] = to_title (cfrule->code);
+		  for (k = 1; k < 3; k++)
+		    new_rule->title_mapping[k] = 0;
+		  new_rule->upper_mapping[0] = to_upper (cfrule->code);
+		  for (k = 1; k < 3; k++)
+		    new_rule->upper_mapping[k] = 0;
+		}
+	      memcpy (new_rule->casefold_mapping, cfrule->mapping,
+		      sizeof (new_rule->casefold_mapping));
+
+	      add_casing_rule (new_rule);
+	    }
+	}
+    }
+}
+
+static int
+compare_casing_rules (const void *a, const void *b) /* Comparison function for qsort; A and B point to elements of casing_rules.  */
+{
+  struct special_casing_rule *a_rule = *(struct special_casing_rule **) a;
+  struct special_casing_rule *b_rule = *(struct special_casing_rule **) b;
+  unsigned int a_code = a_rule->code;
+  unsigned int b_code = b_rule->code;
+
+  if (a_code < b_code)
+    return -1;
+  if (a_code > b_code)
+    return 1;
+
+  /* Same code: sort the more specific rules before the more general ones.  Specificity counts 1 for a language and 1 for a context other than SCC_ALWAYS; equal counts compare as equal.  */
+  return (- ((a_rule->language != NULL ? 1 : 0) + (a_rule->context != SCC_ALWAYS ? 1 : 0))
+	  + ((b_rule->language != NULL ? 1 : 0) + (b_rule->context != SCC_ALWAYS ? 1 : 0)));
+}
+
+static void
+sort_casing_rules (void)
+{
+  /* Sort the rules 1. by code, 2. by specificity.  The relative order of rules that compare equal is whatever qsort produces.  */
+  if (num_casing_rules > 1)
+    qsort (casing_rules, num_casing_rules, sizeof (struct special_casing_rule *),
+	   compare_casing_rules);
+}
+
+/* Output the special casing rules to FILENAME: a header of declarations and % directives, then one line per rule in the order of casing_rules (presumably gperf input, given the .gperf file name passed by main).  Exits with status 1 on failure.  */
+static void
+output_casing_rules (const char *filename, const char *version)
+{
+  FILE *stream;
+  unsigned int i, j;
+  unsigned int minor;
+
+  stream = fopen (filename, "w");
+  if (stream == NULL)
+    {
+      fprintf (stderr, "cannot open '%s' for writing\n", filename);
+      exit (1);
+    }
+
+  fprintf (stream, "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */\n");
+  fprintf (stream, "/* Special casing rules of Unicode characters.  */\n");
+  fprintf (stream, "/* Generated automatically by gen-uni-tables.c for Unicode %s.  */\n",
+	   version);
+  fprintf (stream, "struct special_casing_rule { char code[3]; };\n");
+  fprintf (stream, "%%struct-type\n");
+  fprintf (stream, "%%language=ANSI-C\n");
+  fprintf (stream, "%%define slot-name code\n");
+  fprintf (stream, "%%define hash-function-name gl_unicase_special_hash\n");
+  fprintf (stream, "%%define lookup-function-name gl_unicase_special_lookup\n");
+  fprintf (stream, "%%compare-lengths\n");
+  fprintf (stream, "%%compare-strncmp\n");
+  fprintf (stream, "%%readonly-tables\n");
+  fprintf (stream, "%%omit-struct-type\n");
+  fprintf (stream, "%%%%\n");
+
+  minor = 0;
+  for (i = 0; i < num_casing_rules; i++)
+    {
+      struct special_casing_rule *rule = casing_rules[i];
+      int context;
+
+      if (i > 0 && rule->code == casing_rules[i - 1]->code) /* MINOR numbers consecutive rules with the same code: 0, 1, 2, ... */
+	minor += 1;
+      else
+	minor = 0;
+
+      if (!(rule->code < 0x10000)) /* the key below has room for 16 bits of the code only */
+	{
+	  fprintf (stderr, "special rule #%u: code %u out of range\n", i, rule->code);
+	  exit (1);
+	}
+
+      fprintf (stream, "\"\\x%02x\\x%02x\\x%02x\", ", /* the key: high byte of code, low byte of code, minor */
+	       (rule->code >> 8) & 0xff, rule->code & 0xff, minor);
+
+      fprintf (stream, "%d, ", /* 1 if the next rule has the same code, else 0 */
+	       i + 1 < num_casing_rules && casing_rules[i + 1]->code == rule->code ? 1 : 0);
+
+      context = rule->context;
+      if (context < 0) /* a negated context is printed as '-' followed by the SCC_* name */
+	{
+	  fprintf (stream, "-");
+	  context = - context;
+	}
+      else
+	fprintf (stream, " ");
+      switch (context)
+	{
+	case SCC_ALWAYS:
+	  fprintf (stream, "SCC_ALWAYS           ");
+	  break;
+	case SCC_FINAL_SIGMA:
+	  fprintf (stream, "SCC_FINAL_SIGMA      ");
+	  break;
+	case SCC_AFTER_SOFT_DOTTED:
+	  fprintf (stream, "SCC_AFTER_SOFT_DOTTED");
+	  break;
+	case SCC_MORE_ABOVE:
+	  fprintf (stream, "SCC_MORE_ABOVE       ");
+	  break;
+	case SCC_BEFORE_DOT:
+	  fprintf (stream, "SCC_BEFORE_DOT       ");
+	  break;
+	case SCC_AFTER_I:
+	  fprintf (stream, "SCC_AFTER_I          ");
+	  break;
+	default:
+	  abort ();
+	}
+      fprintf (stream, ", ");
+
+      if (rule->language != NULL) /* the language as two characters, or two NULs if there is none */
+	{
+	  if (strlen (rule->language) != 2)
+	    abort ();
+	  fprintf (stream, "{  '%c',  '%c' }, ", rule->language[0], rule->language[1]);
+	}
+      else
+	fprintf (stream, "{ '\\0', '\\0' }, ");
+
+      fprintf (stream, "{ "); /* 1st array: the upper mapping */
+      for (j = 0; j < 3; j++)
+	{
+	  if (j > 0)
+	    fprintf (stream, ", ");
+	  if (!(rule->upper_mapping[j] < 0x10000))
+	    {
+	      fprintf (stderr, "special rule #%u: upper mapping of code %u out of range\n", i, rule->code);
+	      exit (1);
+	    }
+	  if (rule->upper_mapping[j] != 0)
+	    fprintf (stream, "0x%04X", rule->upper_mapping[j]);
+	  else
+	    fprintf (stream, "     0");
+	}
+      fprintf (stream, " }, { "); /* 2nd array: the lower mapping */
+      for (j = 0; j < 3; j++)
+	{
+	  if (j > 0)
+	    fprintf (stream, ", ");
+	  if (!(rule->lower_mapping[j] < 0x10000))
+	    {
+	      fprintf (stderr, "special rule #%u: lower mapping of code %u out of range\n", i, rule->code);
+	      exit (1);
+	    }
+	  if (rule->lower_mapping[j] != 0)
+	    fprintf (stream, "0x%04X", rule->lower_mapping[j]);
+	  else
+	    fprintf (stream, "     0");
+	}
+      fprintf (stream, " }, { "); /* 3rd array: the title mapping */
+      for (j = 0; j < 3; j++)
+	{
+	  if (j > 0)
+	    fprintf (stream, ", ");
+	  if (!(rule->title_mapping[j] < 0x10000))
+	    {
+	      fprintf (stderr, "special rule #%u: title mapping of code %u out of range\n", i, rule->code);
+	      exit (1);
+	    }
+	  if (rule->title_mapping[j] != 0)
+	    fprintf (stream, "0x%04X", rule->title_mapping[j]);
+	  else
+	    fprintf (stream, "     0");
+	}
+      fprintf (stream, " }, { "); /* 4th array: the casefold mapping */
+      for (j = 0; j < 3; j++)
+	{
+	  if (j > 0)
+	    fprintf (stream, ", ");
+	  if (!(rule->casefold_mapping[j] < 0x10000))
+	    {
+	      fprintf (stderr, "special rule #%u: casefold mapping of code %u out of range\n", i, rule->code);
+	      exit (1);
+	    }
+	  if (rule->casefold_mapping[j] != 0)
+	    fprintf (stream, "0x%04X", rule->casefold_mapping[j]);
+	  else
+	    fprintf (stream, "     0");
+	}
+      fprintf (stream, " }\n");
+    }
+
+  if (ferror (stream) || fclose (stream))
+    {
+      fprintf (stderr, "error writing to '%s'\n", filename);
+      exit (1);
+    }
+}
+
+/* ========================================================================= */
+
 int
 main (int argc, char * argv[])
 {
@@ -7466,11 +8223,13 @@
   const char *linebreak_filename;
   const char *wordbreakproperty_filename;
   const char *compositionexclusions_filename;
+  const char *specialcasing_filename;
+  const char *casefolding_filename;
   const char *version;
 
-  if (argc != 12)
-    {
-      fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt CompositionExclusions.txt version\n",
+  if (argc != 14)
+    {
+      fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt DerivedCoreProperties.txt Scripts.txt Blocks.txt PropList-3.0.1.txt EastAsianWidth.txt LineBreak.txt WordBreakProperty.txt CompositionExclusions.txt SpecialCasing.txt CaseFolding.txt version\n",
 	       argv[0]);
       exit (1);
     }
@@ -7485,7 +8244,9 @@
   linebreak_filename = argv[8];
   wordbreakproperty_filename = argv[9];
   compositionexclusions_filename = argv[10];
-  version = argv[11];
+  specialcasing_filename = argv[11];
+  casefolding_filename = argv[12];
+  version = argv[13];
 
   fill_attributes (unicodedata_filename);
   clear_properties ();
@@ -7498,6 +8259,10 @@
   fill_org_lbp (linebreak_filename);
   fill_org_wbp (wordbreakproperty_filename);
   fill_composition_exclusions (compositionexclusions_filename);
+  fill_casing_rules (specialcasing_filename);
+  fill_casefolding_rules (casefolding_filename);
+  redistribute_casefolding_rules ();
+  sort_casing_rules ();
 
   output_categories (version);
   output_category ("unictype/categ_of.h", version);
@@ -7535,6 +8300,8 @@
   output_simple_mapping ("unicase/toupper.h", to_upper, version);
   output_simple_mapping ("unicase/tolower.h", to_lower, version);
   output_simple_mapping ("unicase/totitle.h", to_title, version);
+  output_simple_mapping ("unicase/tocasefold.h", to_casefold, version);
+  output_casing_rules ("unicase/special-casing-table.gperf", version);
 
   return 0;
 }
@@ -7555,6 +8322,8 @@
         /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/LineBreak.txt \
         /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/auxiliary/WordBreakProperty.txt \
         /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CompositionExclusions.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/SpecialCasing.txt \
+        /gfs/petix/Volumes/ExtData/www-archive/software/i18n/unicode/ftp.unicode.org/ArchiveVersions/5.1.0/ucd/CaseFolding.txt \
         5.1.0
    "
  * End: