mirror of https://gitee.com/Nocallback/glibc.git
Browse Source
for localedata/ChangeLog [BZ #17588] [BZ #13064] [BZ #14094] [BZ #17998] * unicode-gen/Makefile: New. * unicode-gen/unicode-license.txt: New, from Unicode. * unicode-gen/UnicodeData.txt: New, from Unicode. * unicode-gen/DerivedCoreProperties.txt: New, from Unicode. * unicode-gen/EastAsianWidth.txt: New, from Unicode. * unicode-gen/gen_unicode_ctype.py: New generator, from Mike FABIAN <mfabian@redhat.com>. * unicode-gen/ctype_compatibility.py: New verifier, from Pravin Satpute <psatpute@redhat.com> and Mike FABIAN. * unicode-gen/ctype_compatibility_test_cases.py: New verifier module, from Mike FABIAN. * unicode-gen/utf8_gen.py: New generator, from Pravin Satpute and Mike FABIAN. * unicode-gen/utf8_compatibility.py: New verifier, from Pravin Satpute and Mike FABIAN. * charmaps/UTF-8: Update. * locales/i18n: Update. * gen-unicode-ctype.c: Remove. * tst-ctype-de_DE.ISO-8859-1.in: Adjust, islower now returns true for ordinal indicators.hjl/pr18078
16 changed files with 53305 additions and 5382 deletions
File diff suppressed because it is too large
@ -1,784 +0,0 @@ |
|||
/* Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
|
|||
Copyright (C) 2000-2015 Free Software Foundation, Inc. |
|||
This file is part of the GNU C Library. |
|||
Contributed by Bruno Haible <haible@clisp.cons.org>, 2000. |
|||
|
|||
The GNU C Library is free software; you can redistribute it and/or |
|||
modify it under the terms of the GNU Lesser General Public |
|||
License as published by the Free Software Foundation; either |
|||
version 2.1 of the License, or (at your option) any later version. |
|||
|
|||
The GNU C Library is distributed in the hope that it will be useful, |
|||
but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|||
Lesser General Public License for more details. |
|||
|
|||
You should have received a copy of the GNU Lesser General Public |
|||
License along with the GNU C Library; if not, see |
|||
<http://www.gnu.org/licenses/>. */
|
|||
|
|||
/* Usage example:
|
|||
$ gen-unicode /usr/local/share/Unidata/UnicodeData.txt 3.1 |
|||
*/ |
|||
|
|||
#include <stdio.h> |
|||
#include <stdlib.h> |
|||
#include <stdbool.h> |
|||
#include <string.h> |
|||
#include <time.h> |
|||
|
|||
/* This structure represents one line in the UnicodeData.txt file.
   Each member holds the raw text of one semicolon-separated column;
   the code point itself (column 0) is the index into
   unicode_attributes[] below.  */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining classes */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  int mirrored;               /* mirrored */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping */
  unsigned int lower;         /* Lowercase mapping */
  unsigned int title;         /* Titlecase mapping */
};

/* Missing fields are represented with "" for strings, and NONE for
   characters.  */
#define NONE (~(unsigned int)0)

/* The entire contents of the UnicodeData.txt file, indexed by code
   point.  name == NULL marks an unassigned (or surrogate) character.  */
struct unicode_attribute unicode_attributes [0x110000];
|||
|
|||
/* Stores in unicode_attributes[i] the values from the given fields.
   FIELD1..FIELD14 are the raw column texts of one UnicodeData.txt
   line (column 0, the code point, has already been parsed into I).
   Surrogate entries (general category "Cs") are skipped entirely,
   leaving unicode_attributes[i].name == NULL.  */
static void
fill_attribute (unsigned int i,
                const char *field1, const char *field2,
                const char *field3, const char *field4,
                const char *field5, const char *field6,
                const char *field7, const char *field8,
                const char *field9, const char *field10,
                const char *field11, const char *field12,
                const char *field13, const char *field14)
{
  struct unicode_attribute * uni;

  if (i >= 0x110000)
    {
      fprintf (stderr, "index too large\n");
      exit (1);
    }
  if (strcmp (field2, "Cs") == 0)
    /* Surrogates are UTF-16 artefacts, not real characters.  Ignore them.  */
    return;
  uni = &unicode_attributes[i];
  /* Copy the strings.  Empty fields share the literal "" instead of a
     heap copy.  NOTE(review): strdup results are not checked for NULL;
     acceptable for a one-shot generator, but a malloc failure would
     store NULL silently.  */
  uni->name = strdup (field1);
  uni->category = (field2[0] == '\0' ? "" : strdup (field2));
  uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
  uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
  uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
  uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
  uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
  uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
  uni->mirrored = (field9[0] == 'Y');
  uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
  uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
  /* Case mappings are hexadecimal code points; NONE when absent.  */
  uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
  uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
  uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
}
|||
|
|||
/* Maximum length of a field in the UnicodeData.txt file. */ |
|||
#define FIELDLEN 120 |
|||
|
|||
/* Reads the next field from STREAM. The buffer BUFFER has size FIELDLEN.
|
|||
Reads up to (but excluding) DELIM. |
|||
Returns 1 when a field was successfully read, otherwise 0. */ |
|||
static int |
|||
getfield (FILE *stream, char *buffer, int delim) |
|||
{ |
|||
int count = 0; |
|||
int c; |
|||
|
|||
for (; (c = getc (stream)), (c != EOF && c != delim); ) |
|||
{ |
|||
/* The original unicode.org UnicodeData.txt file happens to have
|
|||
CR/LF line terminators. Silently convert to LF. */ |
|||
if (c == '\r') |
|||
continue; |
|||
|
|||
/* Put c into the buffer. */ |
|||
if (++count >= FIELDLEN - 1) |
|||
{ |
|||
fprintf (stderr, "field too long\n"); |
|||
exit (1); |
|||
} |
|||
*buffer++ = c; |
|||
} |
|||
|
|||
if (c == EOF) |
|||
return 0; |
|||
|
|||
*buffer = '\0'; |
|||
return 1; |
|||
} |
|||
|
|||
/* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
|
|||
file. */ |
|||
static void |
|||
fill_attributes (const char *unicodedata_filename) |
|||
{ |
|||
unsigned int i, j; |
|||
FILE *stream; |
|||
char field0[FIELDLEN]; |
|||
char field1[FIELDLEN]; |
|||
char field2[FIELDLEN]; |
|||
char field3[FIELDLEN]; |
|||
char field4[FIELDLEN]; |
|||
char field5[FIELDLEN]; |
|||
char field6[FIELDLEN]; |
|||
char field7[FIELDLEN]; |
|||
char field8[FIELDLEN]; |
|||
char field9[FIELDLEN]; |
|||
char field10[FIELDLEN]; |
|||
char field11[FIELDLEN]; |
|||
char field12[FIELDLEN]; |
|||
char field13[FIELDLEN]; |
|||
char field14[FIELDLEN]; |
|||
int lineno = 0; |
|||
|
|||
for (i = 0; i < 0x110000; i++) |
|||
unicode_attributes[i].name = NULL; |
|||
|
|||
stream = fopen (unicodedata_filename, "r"); |
|||
if (stream == NULL) |
|||
{ |
|||
fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename); |
|||
exit (1); |
|||
} |
|||
|
|||
for (;;) |
|||
{ |
|||
int n; |
|||
|
|||
lineno++; |
|||
n = getfield (stream, field0, ';'); |
|||
n += getfield (stream, field1, ';'); |
|||
n += getfield (stream, field2, ';'); |
|||
n += getfield (stream, field3, ';'); |
|||
n += getfield (stream, field4, ';'); |
|||
n += getfield (stream, field5, ';'); |
|||
n += getfield (stream, field6, ';'); |
|||
n += getfield (stream, field7, ';'); |
|||
n += getfield (stream, field8, ';'); |
|||
n += getfield (stream, field9, ';'); |
|||
n += getfield (stream, field10, ';'); |
|||
n += getfield (stream, field11, ';'); |
|||
n += getfield (stream, field12, ';'); |
|||
n += getfield (stream, field13, ';'); |
|||
n += getfield (stream, field14, '\n'); |
|||
if (n == 0) |
|||
break; |
|||
if (n != 15) |
|||
{ |
|||
fprintf (stderr, "short line in'%s':%d\n", |
|||
unicodedata_filename, lineno); |
|||
exit (1); |
|||
} |
|||
i = strtoul (field0, NULL, 16); |
|||
if (field1[0] == '<' |
|||
&& strlen (field1) >= 9 |
|||
&& !strcmp (field1 + strlen(field1) - 8, ", First>")) |
|||
{ |
|||
/* Deal with a range. */ |
|||
lineno++; |
|||
n = getfield (stream, field0, ';'); |
|||
n += getfield (stream, field1, ';'); |
|||
n += getfield (stream, field2, ';'); |
|||
n += getfield (stream, field3, ';'); |
|||
n += getfield (stream, field4, ';'); |
|||
n += getfield (stream, field5, ';'); |
|||
n += getfield (stream, field6, ';'); |
|||
n += getfield (stream, field7, ';'); |
|||
n += getfield (stream, field8, ';'); |
|||
n += getfield (stream, field9, ';'); |
|||
n += getfield (stream, field10, ';'); |
|||
n += getfield (stream, field11, ';'); |
|||
n += getfield (stream, field12, ';'); |
|||
n += getfield (stream, field13, ';'); |
|||
n += getfield (stream, field14, '\n'); |
|||
if (n != 15) |
|||
{ |
|||
fprintf (stderr, "missing end range in '%s':%d\n", |
|||
unicodedata_filename, lineno); |
|||
exit (1); |
|||
} |
|||
if (!(field1[0] == '<' |
|||
&& strlen (field1) >= 8 |
|||
&& !strcmp (field1 + strlen (field1) - 7, ", Last>"))) |
|||
{ |
|||
fprintf (stderr, "missing end range in '%s':%d\n", |
|||
unicodedata_filename, lineno); |
|||
exit (1); |
|||
} |
|||
field1[strlen (field1) - 7] = '\0'; |
|||
j = strtoul (field0, NULL, 16); |
|||
for (; i <= j; i++) |
|||
fill_attribute (i, field1+1, field2, field3, field4, field5, |
|||
field6, field7, field8, field9, field10, |
|||
field11, field12, field13, field14); |
|||
} |
|||
else |
|||
{ |
|||
/* Single character line */ |
|||
fill_attribute (i, field1, field2, field3, field4, field5, |
|||
field6, field7, field8, field9, field10, |
|||
field11, field12, field13, field14); |
|||
} |
|||
} |
|||
if (ferror (stream) || fclose (stream)) |
|||
{ |
|||
fprintf (stderr, "error reading from '%s'\n", unicodedata_filename); |
|||
exit (1); |
|||
} |
|||
} |
|||
|
|||
/* Character mappings. */ |
|||
|
|||
static unsigned int |
|||
to_upper (unsigned int ch) |
|||
{ |
|||
if (unicode_attributes[ch].name != NULL |
|||
&& unicode_attributes[ch].upper != NONE) |
|||
return unicode_attributes[ch].upper; |
|||
else |
|||
return ch; |
|||
} |
|||
|
|||
static unsigned int |
|||
to_lower (unsigned int ch) |
|||
{ |
|||
if (unicode_attributes[ch].name != NULL |
|||
&& unicode_attributes[ch].lower != NONE) |
|||
return unicode_attributes[ch].lower; |
|||
else |
|||
return ch; |
|||
} |
|||
|
|||
static unsigned int |
|||
to_title (unsigned int ch) |
|||
{ |
|||
if (unicode_attributes[ch].name != NULL |
|||
&& unicode_attributes[ch].title != NONE) |
|||
return unicode_attributes[ch].title; |
|||
else |
|||
return ch; |
|||
} |
|||
|
|||
/* Character class properties. */ |
|||
|
|||
/* A character counts as uppercase when it has a distinct lowercase
   mapping.  */
static bool
is_upper (unsigned int ch)
{
  return to_lower (ch) != ch;
}
|||
|
|||
/* A character counts as lowercase when it has a distinct uppercase
   mapping.  */
static bool
is_lower (unsigned int ch)
{
  if (ch == 0x00DF)
    /* <U00DF> LATIN SMALL LETTER SHARP S is lowercase, but has no
       simple to_upper mapping.  */
    return true;
  return to_upper (ch) != ch;
}
|||
|
|||
/* Tests whether CH belongs to the "alpha" class.  Mostly general
   category 'L', plus a set of historical exceptions and additions
   collected below; each exception carries its original rationale.  */
static bool
is_alpha (unsigned int ch)
{
  return (unicode_attributes[ch].name != NULL
          && ((unicode_attributes[ch].category[0] == 'L'
               /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
                  <U0E2F>, <U0E46> should belong to is_punct.  */
               && (ch != 0x0E2F) && (ch != 0x0E46))
              /* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
                 <U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha.  */
              || (ch == 0x0E31)
              || (ch >= 0x0E34 && ch <= 0x0E3A)
              || (ch >= 0x0E47 && ch <= 0x0E4E)
              /* Avoid warning for <U0345>.  */
              || (ch == 0x0345)
              /* Avoid warnings for <U2160>..<U217F>: letter-like
                 numerals, category Nl.  */
              || (unicode_attributes[ch].category[0] == 'N'
                  && unicode_attributes[ch].category[1] == 'l')
              /* Avoid warnings for <U24B6>..<U24E9>: circled letters,
                 category So with " LETTER " in the name.  */
              || (unicode_attributes[ch].category[0] == 'S'
                  && unicode_attributes[ch].category[1] == 'o'
                  && strstr (unicode_attributes[ch].name, " LETTER ")
                     != NULL)
              /* Consider all the non-ASCII digits as alphabetic.
                 ISO C 99 forbids us to have them in category "digit",
                 but we want iswalnum to return true on them.  */
              || (unicode_attributes[ch].category[0] == 'N'
                  && unicode_attributes[ch].category[1] == 'd'
                  && !(ch >= 0x0030 && ch <= 0x0039))));
}
|||
|
|||
/* Tests whether CH belongs to the "digit" class.

   SUSV2 gives us some freedom for the "digit" category, but ISO C 99
   takes it away: 7.25.2.1.5 defines iswdigit in terms of the
   decimal-digit characters of 5.2.1, i.e. exactly 0 1 2 3 4 5 6 7 8 9.
   Other decimal digit systems (category Nd, e.g. U+0BE7..U+0BEF or
   U+1369..U+1371) are therefore excluded here and handled via
   is_alpha so iswalnum still accepts them.  */
static bool
is_digit (unsigned int ch)
{
  /* Unsigned wrap-around makes this a single range check for
     0x0030 <= ch <= 0x0039.  */
  return (ch - 0x0030) <= 9;
}
|||
|
|||
/* Tests whether CH belongs to the "outdigit" (output digit) class:
   the ten ASCII digits only.  */
static bool
is_outdigit (unsigned int ch)
{
  return (ch - 0x0030) <= 9;
}
|||
|
|||
static bool |
|||
is_blank (unsigned int ch) |
|||
{ |
|||
return (ch == 0x0009 /* '\t' */ |
|||
/* Category Zs without mention of "<noBreak>" */ |
|||
|| (unicode_attributes[ch].name != NULL |
|||
&& unicode_attributes[ch].category[0] == 'Z' |
|||
&& unicode_attributes[ch].category[1] == 's' |
|||
&& !strstr (unicode_attributes[ch].decomposition, "<noBreak>"))); |
|||
} |
|||
|
|||
/* Tests whether CH belongs to the "space" class: the six ASCII
   whitespace characters plus categories Zl, Zp, and those Zs
   characters whose decomposition lacks "<noBreak>".  */
static bool
is_space (unsigned int ch)
{
  /* Don't make U+00A0 a space.  Non-breaking space means that all programs
     should treat it like a punctuation character, not like a space.  */
  return (ch == 0x0020 /* ' ' */
          || ch == 0x000C /* '\f' */
          || ch == 0x000A /* '\n' */
          || ch == 0x000D /* '\r' */
          || ch == 0x0009 /* '\t' */
          || ch == 0x000B /* '\v' */
          /* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
          || (unicode_attributes[ch].name != NULL
              && unicode_attributes[ch].category[0] == 'Z'
              && (unicode_attributes[ch].category[1] == 'l'
                  || unicode_attributes[ch].category[1] == 'p'
                  || (unicode_attributes[ch].category[1] == 's'
                      && !strstr (unicode_attributes[ch].decomposition,
                                  "<noBreak>")))));
}
|||
|
|||
static bool |
|||
is_cntrl (unsigned int ch) |
|||
{ |
|||
return (unicode_attributes[ch].name != NULL |
|||
&& (!strcmp (unicode_attributes[ch].name, "<control>") |
|||
/* Categories Zl and Zp */ |
|||
|| (unicode_attributes[ch].category[0] == 'Z' |
|||
&& (unicode_attributes[ch].category[1] == 'l' |
|||
|| unicode_attributes[ch].category[1] == 'p')))); |
|||
} |
|||
|
|||
/* Tests whether CH belongs to the "xdigit" class.

   SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
   takes it away: 7.25.2.1.12 defines iswxdigit in terms of the
   hexadecimal-digit characters of 6.4.4.1, i.e. exactly
   0-9 a-f A-F.  */
static bool
is_xdigit (unsigned int ch)
{
  /* Unsigned wrap-around turns each test into a single range check.  */
  return (ch - 0x0030) <= 9     /* '0'..'9' */
         || (ch - 0x0041) <= 5  /* 'A'..'F' */
         || (ch - 0x0061) <= 5; /* 'a'..'f' */
}
|||
|
|||
static bool |
|||
is_graph (unsigned int ch) |
|||
{ |
|||
return (unicode_attributes[ch].name != NULL |
|||
&& strcmp (unicode_attributes[ch].name, "<control>") |
|||
&& !is_space (ch)); |
|||
} |
|||
|
|||
static bool |
|||
is_print (unsigned int ch) |
|||
{ |
|||
return (unicode_attributes[ch].name != NULL |
|||
&& strcmp (unicode_attributes[ch].name, "<control>") |
|||
/* Categories Zl and Zp */ |
|||
&& !(unicode_attributes[ch].name != NULL |
|||
&& unicode_attributes[ch].category[0] == 'Z' |
|||
&& (unicode_attributes[ch].category[1] == 'l' |
|||
|| unicode_attributes[ch].category[1] == 'p'))); |
|||
} |
|||
|
|||
/* Tests whether CH belongs to the "punct" class.  The traditional
   POSIX definition is used: every graphic, non-alphanumeric
   character.  (Unicode general category 'P' alone would be a
   strictly narrower set.)  */
static bool
is_punct (unsigned int ch)
{
  if (!is_graph (ch))
    return false;
  return !is_alpha (ch) && !is_digit (ch);
}
|||
|
|||
static bool |
|||
is_combining (unsigned int ch) |
|||
{ |
|||
/* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
|
|||
file. In 3.0.1 it was identical to the union of the general categories |
|||
"Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the |
|||
PropList.txt file, so we take the latter definition. */ |
|||
return (unicode_attributes[ch].name != NULL |
|||
&& unicode_attributes[ch].category[0] == 'M' |
|||
&& (unicode_attributes[ch].category[1] == 'n' |
|||
|| unicode_attributes[ch].category[1] == 'c' |
|||
|| unicode_attributes[ch].category[1] == 'e')); |
|||
} |
|||
|
|||
/* Tests whether CH is a combining character relevant at collation
   level 3: a combining character whose canonical combining class is
   below 200.  The negated condition reads "not (nonempty, nonzero,
   and >= 200)", i.e. classes 0..199 (and empty fields) pass.  */
static bool
is_combining_level3 (unsigned int ch)
{
  return is_combining (ch)
         && !(unicode_attributes[ch].combining[0] != '\0'
              && unicode_attributes[ch].combining[0] != '0'
              && strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
}
|||
|
|||
/* Return the UCS symbol string ("<UXXXX>" below U+10000, otherwise
   "<UXXXXXXXX>") for the Unicode code point I.  The result points
   into a static buffer that the next call overwrites.  */
static const char *
ucs_symbol (unsigned int i)
{
  static char buf[11+1];

  if (i < 0x10000)
    sprintf (buf, "<U%04X>", i);
  else
    sprintf (buf, "<U%08X>", i);
  return buf;
}
|||
|
|||
/* Return the UCS symbol range string "<Ulow>..<Uhigh>" for a Unicode
   characters interval.  The result points into a static buffer that
   the next call overwrites.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  static char buf[24+1];

  /* ucs_symbol returns a pointer to its own static buffer, so copy
     each half out before requesting the next one.  */
  sprintf (buf, "%s..", ucs_symbol (low));
  strcat (buf, ucs_symbol (high));
  return buf;
}
|||
|
|||
/* Output a character class (= property) table.

   Emits "CLASSNAME <Ua>;<Ub>..<Uc>;..." in locale source format:
   runs of consecutive members collapse into <Ulow>..<Uhigh> ranges,
   items are ';'-separated, and lines are wrapped with a '/'
   continuation near column 75.  FUNC is the membership predicate.  */
static void
output_charclass (FILE *stream, const char *classname,
                  bool (*func) (unsigned int))
{
  /* Static: a 0x110000-byte automatic array would put over 1 MiB on
     the stack.  Fully rewritten on every call, so this stays
     correct for repeated invocations (the function was never
     reentrant anyway — ucs_symbol uses a static buffer too).  */
  static char table[0x110000];
  unsigned int i;
  bool need_semicolon;
  const int max_column = 75;
  int column;

  for (i = 0; i < 0x110000; i++)
    table[i] = (int) func (i);

  fprintf (stream, "%s ", classname);
  need_semicolon = false;
  /* Start far beyond max_column so the first item always begins a
     fresh continuation line.  */
  column = 1000;
  for (i = 0; i < 0x110000; )
    {
      if (!table[i])
        i++;
      else
        {
          unsigned int low, high;
          char buf[25];

          /* Extend the run of member characters starting at i.  */
          low = i;
          do
            i++;
          while (i < 0x110000 && table[i]);
          high = i - 1;

          if (low == high)
            strcpy (buf, ucs_symbol (low));
          else
            strcpy (buf, ucs_symbol_range (low, high));

          if (need_semicolon)
            {
              fprintf (stream, ";");
              column++;
            }

          if (column + strlen (buf) > max_column)
            {
              fprintf (stream, "/\n   ");
              column = 3;
            }

          fprintf (stream, "%s", buf);
          column += strlen (buf);
          need_semicolon = true;
        }
    }
  fprintf (stream, "\n");
}
|||
|
|||
/* Output a character mapping table.

   Emits "MAPNAME (<Ufrom>,<Uto>);..." in locale source format for
   every character that FUNC maps to something other than itself,
   wrapping lines with a '/' continuation near column 75.  */
static void
output_charmap (FILE *stream, const char *mapname,
                unsigned int (*func) (unsigned int))
{
  /* Static: a 0x110000-byte automatic array would put over 1 MiB on
     the stack.  Fully rewritten on every call.  */
  static char table[0x110000];
  unsigned int i;
  bool need_semicolon;
  const int max_column = 75;
  int column;

  for (i = 0; i < 0x110000; i++)
    table[i] = (func (i) != i);

  fprintf (stream, "%s ", mapname);
  need_semicolon = false;
  /* Start far beyond max_column so the first item always begins a
     fresh continuation line.  */
  column = 1000;
  for (i = 0; i < 0x110000; i++)
    if (table[i])
      {
        char buf[25+1];

        /* "(<Ufrom>,<Uto>)" — at most 25 characters plus NUL.  */
        strcpy (buf, "(");
        strcat (buf, ucs_symbol (i));
        strcat (buf, ",");
        strcat (buf, ucs_symbol (func (i)));
        strcat (buf, ")");

        if (need_semicolon)
          {
            fprintf (stream, ";");
            column++;
          }

        if (column + strlen (buf) > max_column)
          {
            fprintf (stream, "/\n   ");
            column = 3;
          }

        fprintf (stream, "%s", buf);
        column += strlen (buf);
        need_semicolon = true;
      }
  fprintf (stream, "\n");
}
|||
|
|||
/* Output the width table.  Intentionally empty: this tool does not
   generate the WIDTH section, so the slot is a placeholder kept for
   symmetry with the other output_* functions.  */
static void
output_widthmap (FILE *stream)
{
}
|||
|
|||
/* Output the tables to the given file.

   Writes a complete locale source file: preamble, LC_IDENTIFICATION
   section, then the LC_CTYPE section with all class and mapping
   tables.  Before emitting, every code point is checked against the
   POSIX consistency constraints between classes; violations are
   reported on stderr but do not abort the run.  */
static void
output_tables (const char *filename, const char *version)
{
  FILE *stream;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  /* Locale source preamble: '/' continues lines, '%' starts comments.  */
  fprintf (stream, "escape_char /\n");
  fprintf (stream, "comment_char %%\n");
  fprintf (stream, "\n");
  fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
           version);
  fprintf (stream, "\n");

  fprintf (stream, "LC_IDENTIFICATION\n");
  fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
  fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
  fprintf (stream, "address \"\"\n");
  fprintf (stream, "contact \"\"\n");
  fprintf (stream, "email \"bug-glibc-locales@gnu.org\"\n");
  fprintf (stream, "tel \"\"\n");
  fprintf (stream, "fax \"\"\n");
  fprintf (stream, "language \"\"\n");
  fprintf (stream, "territory \"Earth\"\n");
  fprintf (stream, "revision \"%s\"\n", version);
  {
    /* Stamp the file with the current UTC date.  */
    time_t now;
    char date[11];
    now = time (NULL);
    strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
    fprintf (stream, "date \"%s\"\n", date);
  }
  fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
  fprintf (stream, "END LC_IDENTIFICATION\n");
  fprintf (stream, "\n");

  /* Verifications.  */
  for (ch = 0; ch < 0x110000; ch++)
    {
      /* toupper restriction: "Only characters specified for the keywords
         lower and upper shall be specified."  */
      if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
        fprintf (stderr,
                 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
                 ucs_symbol (ch), ch, to_upper (ch));

      /* tolower restriction: "Only characters specified for the keywords
         lower and upper shall be specified."  */
      if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
        fprintf (stderr,
                 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
                 ucs_symbol (ch), ch, to_lower (ch));

      /* alpha restriction: "Characters classified as either upper or lower
         shall automatically belong to this class."  */
      if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
        fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));

      /* alpha restriction: "No character specified for the keywords cntrl,
         digit, punct or space shall be specified."  */
      if (is_alpha (ch) && is_cntrl (ch))
        fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
      if (is_alpha (ch) && is_digit (ch))
        fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
      if (is_alpha (ch) && is_punct (ch))
        fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
      if (is_alpha (ch) && is_space (ch))
        fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));

      /* space restriction: "No character specified for the keywords upper,
         lower, alpha, digit, graph or xdigit shall be specified."
         upper, lower, alpha already checked above.  */
      if (is_space (ch) && is_digit (ch))
        fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
      if (is_space (ch) && is_graph (ch))
        fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
      if (is_space (ch) && is_xdigit (ch))
        fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));

      /* cntrl restriction: "No character specified for the keywords upper,
         lower, alpha, digit, punct, graph, print or xdigit shall be
         specified."  upper, lower, alpha already checked above.  */
      if (is_cntrl (ch) && is_digit (ch))
        fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
      if (is_cntrl (ch) && is_punct (ch))
        fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
      if (is_cntrl (ch) && is_graph (ch))
        fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
      if (is_cntrl (ch) && is_print (ch))
        fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
      if (is_cntrl (ch) && is_xdigit (ch))
        fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));

      /* punct restriction: "No character specified for the keywords upper,
         lower, alpha, digit, cntrl, xdigit or as the <space> character shall
         be specified."  upper, lower, alpha, cntrl already checked above.  */
      if (is_punct (ch) && is_digit (ch))
        fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
      if (is_punct (ch) && is_xdigit (ch))
        fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
      if (is_punct (ch) && (ch == 0x0020))
        fprintf (stderr, "%s is punct\n", ucs_symbol (ch));

      /* graph restriction: "No character specified for the keyword cntrl
         shall be specified."  Already checked above.  */

      /* print restriction: "No character specified for the keyword cntrl
         shall be specified."  Already checked above.  */

      /* graph - print relation: differ only in the <space> character.
         How is this possible if there are more than one space character?!
         I think susv2/xbd/locale.html should speak of "space characters",
         not "space character".  */
      if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
        fprintf (stderr,
                 "%s is print but not graph|<space>\n", ucs_symbol (ch));
      if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
        fprintf (stderr,
                 "%s is graph|<space> but not print\n", ucs_symbol (ch));
    }

  /* Emit the LC_CTYPE section: all classes, then the mappings.  */
  fprintf (stream, "LC_CTYPE\n");
  output_charclass (stream, "upper", is_upper);
  output_charclass (stream, "lower", is_lower);
  output_charclass (stream, "alpha", is_alpha);
  output_charclass (stream, "digit", is_digit);
  output_charclass (stream, "outdigit", is_outdigit);
  output_charclass (stream, "blank", is_blank);
  output_charclass (stream, "space", is_space);
  output_charclass (stream, "cntrl", is_cntrl);
  output_charclass (stream, "punct", is_punct);
  output_charclass (stream, "xdigit", is_xdigit);
  output_charclass (stream, "graph", is_graph);
  output_charclass (stream, "print", is_print);
  output_charclass (stream, "class \"combining\";", is_combining);
  output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
  output_charmap (stream, "toupper", to_upper);
  output_charmap (stream, "tolower", to_lower);
  output_charmap (stream, "map \"totitle\";", to_title);
  output_widthmap (stream);
  fprintf (stream, "END LC_CTYPE\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
|||
|
|||
/* Usage: gen-unicode-ctype UnicodeData.txt version
   Parses the given UnicodeData.txt file and writes the generated
   LC_CTYPE locale source to a file named "unicode" in the current
   directory.  */
int
main (int argc, char * argv[])
{
  if (argc != 3)
    {
      fprintf (stderr, "Usage: %s UnicodeData.txt version\n", argv[0]);
      exit (1);
    }

  /* Slurp the Unicode character database into unicode_attributes[].  */
  fill_attributes (argv[1]);

  /* Emit the locale source; argv[2] is the Unicode version string
     stamped into the generated header and LC_IDENTIFICATION.  */
  output_tables ("unicode", argv[2]);

  return 0;
}
|||
File diff suppressed because it is too large
File diff suppressed because it is too large
File diff suppressed because it is too large
@ -0,0 +1,99 @@ |
|||
# Copyright (C) 2015 Free Software Foundation, Inc.
# This file is part of the GNU C Library.

# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.

# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.

# Makefile for generating and updating Unicode-extracted files.


# This Makefile is NOT used as part of the GNU libc build.  It needs
# to be run manually, within the source tree, at Unicode upgrades
# (change UNICODE_VERSION below), to update ../locales/i18n ctype
# information (part of the file is preserved, so don't wipe it all
# out), and ../charmaps/UTF-8.

# Use make all to generate the files used in the glibc build out of
# the original Unicode files; make check to verify that they are what
# we expect; make install to copy them to the location expected by the
# glibc build; and make clean to remove all generated files.

# We keep a local copy of the downloaded Unicode files, to avoid
# running afoul of the LGPL corresponding sources requirements, even
# though it's not clear that they are preferred over the generated
# files for making modifications.


UNICODE_VERSION = 7.0.0

PYTHON3 = python3
WGET = wget

# Upstream Unicode data files consumed by the generators.
DOWNLOADS = UnicodeData.txt DerivedCoreProperties.txt EastAsianWidth.txt
# Files generated here and later installed into the glibc tree.
GENERATED = i18n UTF-8
# Comparison reports used by the check targets.
REPORTS = i18n-report UTF-8-report

all: $(GENERATED)

check: check-i18n check-UTF-8

install:
	cp -p i18n ../locales/i18n
	cp -p UTF-8 ../charmaps/UTF-8

clean: mostlyclean
	-rm -rf __pycache__
mostlyclean:
	-rm -f $(REPORTS) $(GENERATED)

.PHONY: all check clean mostlyclean install

i18n: UnicodeData.txt DerivedCoreProperties.txt
i18n: ../locales/i18n # Preserve non-ctype information.
i18n: gen_unicode_ctype.py
	$(PYTHON3) gen_unicode_ctype.py -u UnicodeData.txt \
	  -d DerivedCoreProperties.txt -i ../locales/i18n -o $@ \
	  --unicode_version $(UNICODE_VERSION)

i18n-report: i18n ../locales/i18n
i18n-report: ctype_compatibility.py ctype_compatibility_test_cases.py
	$(PYTHON3) ./ctype_compatibility.py -o ../locales/i18n \
	  -n i18n -a -m > $@

# Fail (asking for manual verification) when the report shows any
# missing/added characters or a nonzero error count.
check-i18n: i18n-report
	@if grep '\(Missing\|Added\) [^0]\|^Number of errors[^=]* = [^0]' \
	  i18n-report; \
	then echo manual verification required; false; else true; fi

UTF-8: UnicodeData.txt EastAsianWidth.txt
UTF-8: utf8_gen.py
	$(PYTHON3) utf8_gen.py UnicodeData.txt EastAsianWidth.txt

UTF-8-report: UTF-8 ../charmaps/UTF-8
UTF-8-report: utf8_compatibility.py
	$(PYTHON3) ./utf8_compatibility.py -o ../charmaps/UTF-8 \
	  -n UTF-8 -a -m > $@

# Fail (asking for manual verification) when any Total count is nonzero.
check-UTF-8: UTF-8-report
	@if grep '^Total.*: [^0]' UTF-8-report; \
	then echo manual verification required; false; else true; fi


.PHONY: downloads clean-downloads
downloads: $(DOWNLOADS)
clean-downloads:
	-rm -f $(DOWNLOADS)

$(DOWNLOADS):
	$(WGET) http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$@
|||
File diff suppressed because it is too large
@ -0,0 +1,546 @@ |
|||
#!/usr/bin/python3 |
|||
# -*- coding: utf-8 -*- |
|||
# Copyright (C) 2014, 2015 Free Software Foundation, Inc. |
|||
# This file is part of the GNU C Library. |
|||
# |
|||
# The GNU C Library is free software; you can redistribute it and/or |
|||
# modify it under the terms of the GNU Lesser General Public |
|||
# License as published by the Free Software Foundation; either |
|||
# version 2.1 of the License, or (at your option) any later version. |
|||
# |
|||
# The GNU C Library is distributed in the hope that it will be useful, |
|||
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|||
# Lesser General Public License for more details. |
|||
# |
|||
# You should have received a copy of the GNU Lesser General Public |
|||
# License along with the GNU C Library; if not, see |
|||
# <http://www.gnu.org/licenses/>. |
|||
|
|||
''' |
|||
This script is useful for checking the differences between |
|||
an old LC_CTYPE file /usr/share/i18n/locale/i18n and a |
|||
new one generated by gen_unicode_ctype.py |
|||
|
|||
To see how it is used, call it with the “-h” option: |
|||
|
|||
$ ./ctype_compatibility.py -h |
|||
… prints usage message … |
|||
''' |
|||
|
|||
import sys |
|||
import re |
|||
import unicodedata |
|||
import argparse |
|||
|
|||
from ctype_compatibility_test_cases import TEST_CASES |
|||
|
|||
def get_lines_from_file(filename):
    '''Yield all non-comment lines of an i18n file.

    “%” starts a comment which extends to the end of the line; lines
    whose (stripped) content ends in “/” are continuation lines and are
    merged with the following line(s) before being yielded.
    '''
    with open(filename) as i18n_file:
        pending = ''
        for raw_line in i18n_file:
            raw_line = raw_line.strip('\n')
            percent_pos = raw_line.find('%')
            if percent_pos != -1:
                # Drop the comment, but keep a trailing “/” so the
                # continuation is still recognised below.
                continued = raw_line.endswith('/')
                raw_line = raw_line[0:percent_pos]
                if continued:
                    raw_line += '/'
            raw_line = raw_line.strip()
            if raw_line.endswith('/'):
                pending += raw_line[:-1]
            else:
                yield pending + raw_line
                pending = ''
        if pending:  # file ends with a continuation line
            yield pending
|||
|
|||
def extract_character_classes(filename):
    '''Get all Unicode code points for each character class from a file

    Store these code points in a dictionary using the character classes
    as keys and the list of code points in this character class as values.

    In case of the character classes “toupper”, “tolower”, and “totitle”,
    these are actually pairs of code points.
    '''
    ctype_dict = {}
    for line in get_lines_from_file(filename):
        for char_class in [
                'upper',
                'lower',
                'alpha',
                'digit',
                'outdigit',
                'space',
                'cntrl',
                'punct',
                'graph',
                'print',
                'xdigit',
                'blank',
                'combining',
                'combining_level3',
                'toupper',
                'tolower',
                'totitle']:
            # The class may appear either as ‘class "upper"; …’ /
            # ‘map "toupper"; …’ or as a bare ‘upper …’ keyword.
            # Use raw strings throughout: '\s' in a non-raw string is an
            # invalid escape sequence (DeprecationWarning, later a
            # SyntaxWarning/SyntaxError).
            match = re.match(r'^('
                             + r'(?:(?:class|map)\s+")'
                             + re.escape(char_class)
                             + r'(?:";)\s+'
                             + r'|'
                             + re.escape(char_class) + r'\s+'
                             + r')', line)
            if match:
                if char_class not in ctype_dict:
                    ctype_dict[char_class] = []
                process_chars(
                    ctype_dict[char_class],
                    line[match.end():])
    return ctype_dict
|||
|
|||
def process_chars(char_class_list, code_point_line):
    '''Extract Unicode values from code_point_line and append them to
    char_class_list.

    Recognised “;”-separated forms:
      <Uxxxx>                 single code point (appended as int)
      <Uxxxx>..<Uxxxx>        inclusive range
      <Uxxxx>..(2)..<Uxxxx>   inclusive range with step 2
      (<Uxxxx>,<Uxxxx>)       case-mapping pair (appended as tuple)

    Terminates the program if a field matches none of these forms.
    '''
    # All patterns use raw strings: '\.' and '\(' in non-raw strings are
    # invalid escape sequences (DeprecationWarning in modern Python).
    for code_points in code_point_line.split(';'):
        code_points = code_points.strip()
        match = re.match(r'^<U(?P<codepoint>[0-9A-F]{4,8})>$', code_points)
        if match: # <Uxxxx>
            char_class_list.append(
                int(match.group('codepoint'), 16))
            continue
        match = re.match(
            r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
            + r'\.\.'
            + r'<U(?P<codepoint2>[0-9A-F]{4,8})>$',
            code_points)
        if match: # <Uxxxx>..<Uxxxx>
            for codepoint in range(
                    int(match.group('codepoint1'), 16),
                    int(match.group('codepoint2'), 16) + 1):
                char_class_list.append(codepoint)
            continue
        match = re.match(
            r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
            + r'\.\.\(2\)\.\.'
            + r'<U(?P<codepoint2>[0-9A-F]{4,8})>$',
            code_points)
        if match: # <Uxxxx>..(2)..<Uxxxx>
            for codepoint in range(
                    int(match.group('codepoint1'), 16),
                    int(match.group('codepoint2'), 16) + 1,
                    2):
                char_class_list.append(codepoint)
            continue
        match = re.match(
            r'^\('
            + r'<U(?P<codepoint1>[0-9A-F]{4,8})>'
            + r','
            + r'<U(?P<codepoint2>[0-9A-F]{4,8})>'
            + r'\)$',
            code_points)
        if match: # (<Uxxxx>,<Uxxxx>)
            char_class_list.append((
                int(match.group('codepoint1'), 16),
                int(match.group('codepoint2'), 16)))
            continue
        sys.stderr.write(
            ('None of the regexps matched '
             + 'code_points=%(cp)s in code_point_line=%(cpl)s\n') %{
                 'cp': code_points,
                 'cpl': code_point_line
             })
        # sys.exit instead of the builtin exit(): the builtin is only
        # guaranteed to exist in interactive sessions (site module).
        sys.exit(1)
|||
|
|||
def compare_lists(old_ctype_dict, new_ctype_dict):
    '''Compare character classes in the old and the new LC_CTYPE

    First lists classes present in only one of the two files, then
    reports per-class differences for the classes present in both.
    '''
    print('****************************************************')
    print('Character classes which are only in the new '
          + 'or only in the old file:')
    for char_class in sorted(old_ctype_dict):
        if char_class not in new_ctype_dict:
            print('Character class %s is in old ctype but not in new ctype'
                  %char_class)
    for char_class in sorted(new_ctype_dict):
        if char_class not in old_ctype_dict:
            print('Character class %s is in new ctype but not in old ctype'
                  %char_class)
    for char_class in sorted(old_ctype_dict):
        # Bug fix: a class present only in the old file was already
        # reported above; indexing new_ctype_dict with it would raise
        # KeyError.
        if char_class not in new_ctype_dict:
            continue
        print("****************************************************")
        print("%s: %d chars in old ctype and %d chars in new ctype" %(
            char_class,
            len(old_ctype_dict[char_class]),
            len(new_ctype_dict[char_class])))
        print("----------------------------------------------------")
        report(char_class,
               old_ctype_dict[char_class],
               new_ctype_dict[char_class])
|||
|
|||
def report_code_points(char_class, code_point_list, text=''):
    '''Print one detail line per code point added to or removed from a
    character class.

    Entries that are plain integers are single code points; tuple
    entries are case-mapping pairs and are printed as “old → new”.
    '''
    for code_point in sorted(code_point_list):
        if isinstance(code_point, tuple):
            print(('%(char_class)s: %(text)s: '
                   + '%(char0)s → %(char1)s '
                   + '%(code_point0)s → %(code_point1)s '
                   + '%(name0)s → %(name1)s') %{
                       'text': text,
                       'char_class': char_class,
                       'char0': chr(code_point[0]),
                       'code_point0': hex(code_point[0]),
                       'name0': unicodedata.name(chr(code_point[0]), 'name unknown'),
                       'char1': chr(code_point[1]),
                       'code_point1': hex(code_point[1]),
                       'name1': unicodedata.name(chr(code_point[1]), 'name unknown')
                   })
        else:
            print('%(char_class)s: %(text)s: %(char)s %(code_point)s %(name)s'
                  %{'text': text,
                    'char': chr(code_point),
                    'char_class': char_class,
                    'code_point': hex(code_point),
                    'name': unicodedata.name(chr(code_point), 'name unknown')})
|||
|
|||
def report(char_class, old_list, new_list):
    '''Print a summary of the differences in one LC_CTYPE character
    class between the old and the newly generated file.

    Detailed per-character listings are only produced when the
    corresponding command-line options were given (see ARGS).
    '''
    old_set = set(old_list)
    new_set = set(new_list)
    missing_chars = list(old_set - new_set)
    print(('%(char_class)s: Missing %(number)d characters '
           + 'of old ctype in new ctype ')
          %{'char_class': char_class, 'number': len(missing_chars)})
    if ARGS.show_missing_characters:
        report_code_points(char_class, missing_chars, 'Missing')
    added_chars = list(new_set - old_set)
    print(('%(char_class)s: Added %(number)d characters '
           + 'in new ctype which were not in old ctype')
          %{'char_class': char_class, 'number': len(added_chars)})
    if ARGS.show_added_characters:
        report_code_points(char_class, added_chars, 'Added')
|||
|
|||
|
|||
def cperror(error_message, errorcounter=0):
    '''Print an error message and return the error count increased by one'''
    print(error_message)
    errorcounter += 1
    return errorcounter
|||
|
|||
def cpcheck(ctype_dict, code_point_list_with_ranges, char_classes, reason='',
            errorcounter=0):
    '''Verify class membership for a set of code points.

    “code_point_list_with_ranges” is a list whose entries are either a
    single integer code point or a pair of integers standing for the
    inclusive range between them, for example:

        [0x0E31, (0x0E34, 0x0E3A), (0x0E47, 0x0E4E)]

    “char_classes” is a sequence of (class name, expected membership)
    tuples; every mismatch is reported via cperror() and counted.
    Returns the updated error count.
    '''
    for entry in code_point_list_with_ranges:
        if isinstance(entry, int):
            span = range(entry, entry + 1)
        else:
            span = range(entry[0], entry[1] + 1)
        for code_point in span:
            for char_class, expected in char_classes:
                if (code_point in ctype_dict[char_class]) != expected:
                    errorcounter = cperror(
                        ('error: %(code_point)s %(char)s '
                         + '%(char_class)s %(in)s: %(reason)s') %{
                             'code_point': hex(code_point),
                             'char': chr(code_point),
                             'char_class': char_class,
                             'in': not expected,
                             'reason': reason},
                        errorcounter)
    return errorcounter
|||
|
|||
def tests(ctype_dict, errorcounter = 0):
    '''Test a LC_CTYPE character class dictionary for known errors

    Every violation of the POSIX LC_CTYPE restrictions is reported via
    cperror(); the function returns the total error count (added to the
    initial “errorcounter”).
    '''
    # Copy the information from ctype_dict (which contains lists) into
    # a new dictionary ctype_dict2 (which contains dictionaries).
    # The checks below are easier with that type of data structure.
    ctype_dict2 = {}
    for key in ctype_dict:
        ctype_dict2[key] = {}
        if ctype_dict[key]:
            if type(ctype_dict[key][0]) == type(int()):
                for value in ctype_dict[key]:
                    ctype_dict2[key][value] = 1
            else: # key is 'toupper', 'tolower', or 'totitle'
                for value in ctype_dict[key]:
                    ctype_dict2[key][value[0]] = value[1]

    # Hand-picked regression cases, see ctype_compatibility_test_cases.py.
    for test_case in TEST_CASES:
        errorcounter = cpcheck(ctype_dict2,
                               test_case[0],
                               test_case[1],
                               test_case[2],
                               errorcounter = errorcounter)

    # POSIX requires several pairs of character classes to be mutually
    # exclusive.  Instead of one nearly identical if-block per pair, the
    # pairs are checked in a data-driven loop; their order preserves the
    # order of the original checks (and therefore of the messages):
    #
    #   alpha restriction: "No character specified for the keywords
    #   cntrl, digit, punct or space shall be specified."
    #   space restriction: "No character specified for the keywords
    #   upper, lower, alpha, digit, graph or xdigit shall be specified."
    #   cntrl restriction: "No character specified for the keywords
    #   upper, lower, alpha, digit, punct, graph, print or xdigit shall
    #   be specified."
    #   punct restriction: "No character specified for the keywords
    #   upper, lower, alpha, digit, cntrl, xdigit or as the <space>
    #   character shall be specified."
    #   graph/print restriction: "No character specified for the
    #   keyword cntrl shall be specified."
    #
    # (upper/lower overlaps are implied by the alpha check below.)
    exclusive_pairs = [
        ('alpha', 'cntrl'), ('alpha', 'digit'),
        ('alpha', 'punct'), ('alpha', 'space'),
        ('space', 'digit'), ('space', 'graph'), ('space', 'xdigit'),
        ('cntrl', 'digit'), ('cntrl', 'punct'), ('cntrl', 'graph'),
        ('cntrl', 'print'), ('cntrl', 'xdigit'),
        ('punct', 'digit'), ('punct', 'xdigit'),
    ]

    for code_point in range(0, 0x110000):
        # toupper/tolower restriction: "Only characters specified for
        # the keywords lower and upper shall be specified."
        for mapping in ('toupper', 'tolower'):
            if (code_point in ctype_dict2[mapping]
                and code_point != ctype_dict2[mapping][code_point]
                and not (code_point in ctype_dict2['lower']
                         or code_point in ctype_dict2['upper'])):
                errorcounter = cperror(
                    ('error: %(char1)s is not upper|lower '
                     + 'but %(map)s(%(cp1)s)=%(cp2)s (%(char2)s)') %{
                         'map': mapping,
                         'char1': chr(code_point),
                         'cp1': hex(code_point),
                         'cp2': hex(ctype_dict2[mapping][code_point]),
                         'char2': chr(ctype_dict2[mapping][code_point])
                     },
                    errorcounter)
        # alpha restriction: "Characters classified as either upper or
        # lower shall automatically belong to this class."
        if ((code_point in ctype_dict2['lower']
             or code_point in ctype_dict2['upper'])
            and code_point not in ctype_dict2['alpha']):
            errorcounter = cperror(
                'error: %(char)s %(cp)s is upper|lower but not alpha' %{
                    'char': chr(code_point),
                    'cp': hex(code_point)
                },
                errorcounter)
        # Mutually exclusive class pairs (see table above).
        for class_a, class_b in exclusive_pairs:
            if (code_point in ctype_dict2[class_a]
                and code_point in ctype_dict2[class_b]):
                errorcounter = cperror(
                    'error: %(char)s %(cp)s is %(a)s and %(b)s' %{
                        'char': chr(code_point),
                        'cp': hex(code_point),
                        'a': class_a,
                        'b': class_b
                    },
                    errorcounter)
        # punct restriction: the <space> character itself shall not be
        # classified punct.
        if (code_point in ctype_dict2['punct']
            and code_point == 0x0020):
            errorcounter = cperror(
                'error: %(char)s %(cp)s is punct.' %{
                    'char': chr(code_point),
                    'cp': hex(code_point)
                },
                errorcounter)
        # graph - print relation: differ only in the <space> character.
        # How is this possible if there are more than one space character?!
        # I think susv2/xbd/locale.html should speak of "space characters",
        # not "space character".
        if (code_point in ctype_dict2['print']
            and not (code_point in ctype_dict2['graph']
                     or code_point in ctype_dict2['space'])):
            errorcounter = cperror(
                'error: %(char)s %(cp)s is print but not graph|space' %{
                    'char': chr(code_point),
                    'cp': hex(code_point)
                },
                errorcounter)
        if (code_point not in ctype_dict2['print']
            and (code_point in ctype_dict2['graph']
                 or code_point == 0x0020)):
            errorcounter = cperror(
                'error: %(char)s %(cp)s graph|space but not print' %{
                    'char': chr(code_point),
                    'cp': hex(code_point)
                },
                errorcounter)
    return errorcounter
|||
|
|||
if __name__ == "__main__":
    # Command-line entry point: parse both LC_CTYPE files, print their
    # differences, then run the consistency checks on each.
    PARSER = argparse.ArgumentParser(
        description='''
    Compare the contents of LC_CTYPE in two files and check for errors.
    ''')
    PARSER.add_argument(
        '-o', '--old_ctype_file',
        nargs='?',
        type=str,
        default='i18n',
        help='The old ctype file, default: %(default)s')
    PARSER.add_argument(
        '-n', '--new_ctype_file',
        nargs='?',
        type=str,
        default='unicode-ctype',
        help='The new ctype file, default: %(default)s')
    PARSER.add_argument(
        '-a', '--show_added_characters',
        action='store_true',
        help=('Show characters which were added to each '
              + 'character class in detail.'))
    PARSER.add_argument(
        '-m', '--show_missing_characters',
        action='store_true',
        help=('Show characters which were removed from each '
              + 'character class in detail.'))
    # ARGS is read as a module-level global by report(); do not rename.
    ARGS = PARSER.parse_args()

    # Parse both files into {character class: code point list} maps.
    OLD_CTYPE_DICT = extract_character_classes(
        ARGS.old_ctype_file)
    NEW_CTYPE_DICT = extract_character_classes(
        ARGS.new_ctype_file)
    # Show what changed between the two files, class by class.
    compare_lists(OLD_CTYPE_DICT, NEW_CTYPE_DICT)
    print('============================================================')
    print('Checking for errors in old ctype file: %s' %ARGS.old_ctype_file)
    print('------------------------------------------------------------')
    NUMBER_OF_ERRORS_IN_OLD_FILE = tests(OLD_CTYPE_DICT, errorcounter = 0)
    print('------------------------------------------------------------')
    print('Old file = %s' %ARGS.old_ctype_file)
    print('Number of errors in old file = %s' %NUMBER_OF_ERRORS_IN_OLD_FILE)
    print('------------------------------------------------------------')
    print('============================================================')
    print('Checking for errors in new ctype file: %s' %ARGS.new_ctype_file)
    print('------------------------------------------------------------')
    NUMBER_OF_ERRORS_IN_NEW_FILE = tests(NEW_CTYPE_DICT, errorcounter = 0)
    print('------------------------------------------------------------')
    print('New file = %s' %ARGS.new_ctype_file)
    print('Number of errors in new file = %s' %NUMBER_OF_ERRORS_IN_NEW_FILE)
    print('------------------------------------------------------------')
    # Exit status reflects only the NEW file: the old file is known to
    # contain historical inconsistencies which are merely reported.
    if NUMBER_OF_ERRORS_IN_NEW_FILE > 0:
        exit(1)
    else:
        exit(0)
|||
@ -0,0 +1,951 @@ |
|||
# -*- coding: utf-8 -*- |
|||
# Copyright (C) 2014, 2015 Free Software Foundation, Inc. |
|||
# This file is part of the GNU C Library. |
|||
# |
|||
# The GNU C Library is free software; you can redistribute it and/or |
|||
# modify it under the terms of the GNU Lesser General Public |
|||
# License as published by the Free Software Foundation; either |
|||
# version 2.1 of the License, or (at your option) any later version. |
|||
# |
|||
# The GNU C Library is distributed in the hope that it will be useful, |
|||
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|||
# Lesser General Public License for more details. |
|||
# |
|||
# You should have received a copy of the GNU Lesser General Public |
|||
# License along with the GNU C Library; if not, see |
|||
# <http://www.gnu.org/licenses/>. |
|||
|
|||
''' |
|||
This file contains a list of test cases used by |
|||
the ctype_compatibility.py script. |
|||
''' |
|||
|
|||
TEST_CASES = [ |
|||
[[0x0E2F, 0x0E46], [('alpha', True), ('punct', False)], |
|||
'''Theppitak Karoonboonyanan <thep@links.nectec.or.th> says |
|||
<U0E2F>, <U0E46> should belong to punct. DerivedCoreProperties.txt |
|||
says it is alpha. We trust DerivedCoreProperties.txt.''' |
|||
], |
|||
[[0x0E31, (0x0E34, 0x0E3A)], [('alpha', True)], |
|||
'''gen-unicode-ctype.c: Theppitak Karoonboonyanan |
|||
<thep@links.nectec.or.th> says <U0E31>, <U0E34>..<U0E3A> |
|||
are alpha. DerivedCoreProperties.txt agrees.''' |
|||
], |
|||
[[(0x0E47, 0x0E4C), 0x0E4E], [('alpha', False)], |
|||
'''gen-unicode-ctype.c: Theppitak Karoonboonyanan |
|||
<thep@links.nectec.or.th> says <U0E47>..<U0E4E> are |
|||
is_alpha. DerivedCoreProperties does says *only* <U0E4D> |
|||
in that range is alphabetic, the others are *not*. We |
|||
trust DerivedCoreProperties.txt.''' |
|||
], |
|||
[[0x0E4D], [('alpha', True)], |
|||
'''gen-unicode-ctype.c: Theppitak Karoonboonyanan |
|||
<thep@links.nectec.or.th> says <U0E47>..<U0E4E> are |
|||
is_alpha. DerivedCoreProperties does says *only* <U0E4D> |
|||
in that range is alphabetic, the others are *not*. We |
|||
trust DerivedCoreProperties.txt. |
|||
''' |
|||
], |
|||
[[0x0345], [('alpha', True), ('lower', True)], |
|||
'''COMBINING GREEK YPOGEGRAMMENI |
|||
According to DerivedCoreProperties.txt, this is “Alphabetic” |
|||
and “Lowercase”.''' |
|||
], |
|||
[[(0x2160, 0x2188)], [('alpha', True)], |
|||
'''Roman Numerals are “Alphabetic” according to |
|||
DerivedCoreProperties.txt''' |
|||
], |
|||
[[(0x24B6, 0x24E9)], [('alpha', True)], |
|||
'''Circled Latin letters are “Alphabetic” according to |
|||
DerivedCoreProperties.txt''' |
|||
], |
|||
[[0x661], [('alpha', True), ('digit', False)], |
|||
'''gen-unicode-ctype.c: All non-ASCII digits should be alphabetic. |
|||
ISO C 99 forbids us to have them in category "digit", but we |
|||
want iswalnum to return true on them. Don’t forget to |
|||
have a look at all the other digits, 0x661 is just one |
|||
example tested here.''' |
|||
], |
|||
[[(0x0030, 0x0039)], [('digit', True)], |
|||
'''gen-unicode-ctype.c: All ASCII digits should be digits.''' |
|||
], |
|||
[[0x0009], [('blank', True)], |
|||
'''gen-unicode-ctype.c: CHARACTER TABULATION''' |
|||
], |
|||
[[0x2007], [('blank', False), ('space', False)], |
|||
'''gen-unicode-ctype.c: FIGURE SPACE, because it has <noBreak> |
|||
in the description.''' |
|||
], |
|||
[[0x0009, 0x000A, 0x000B, 0x000C, 0x000D], [('space', True)], |
|||
'''gen-unicode-ctype.c: CHARACTER TABULATION, LINE FEED (LF), LINE |
|||
TABULATION, ;FORM FEED (FF), CARRIAGE RETURN (CR)''' |
|||
], |
|||
[[0x2028, 0x2029], [('cntrl', True)], |
|||
'''gen-unicode-ctype.c: LINE SEPARATOR and PARAGRAPH SEPARATOR |
|||
should be cntrl.''' |
|||
], |
|||
[[(0x0030, 0x0039), (0x0041, 0x0046), (0x0061, 0x0066)], |
|||
[('xdigit', True)], |
|||
'''gen-unicode-ctype.c: ISO C 99 says (6.4.4.1): hexadecimal-digit: |
|||
one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F (nothing else |
|||
should be considered as a hexadecimal-digit)''' |
|||
], |
|||
[[0x0330], [('combining', True), ('combining_level3', False)], |
|||
'''gen-unicode-ctype.c: COMBINING TILDE BELOW, canonical combining |
|||
class value >= 200, should be in combining but not in |
|||
combining_level3''' |
|||
], |
|||
[[0x0250, 0x0251, 0x0271], [('lower', True)], |
|||
'''Should be lower in Unicode 7.0.0 (was not lower in |
|||
Unicode 5.0.0). |
|||
''' |
|||
], |
|||
[[0x2184], [('lower', True)], |
|||
'''Should be lower both in Unicode 5.0.0 and 7.0.0''' |
|||
], |
|||
[[0xA67F], [('punct', False), ('alpha', True)], |
|||
'''0xa67f CYRILLIC PAYEROK. Not in Unicode 5.0.0. In Unicode |
|||
7.0.0. General category Lm (Letter |
|||
modifier). DerivedCoreProperties.txt says it is |
|||
“Alphabetic”. Apparently added manually to punct by mistake in |
|||
glibc’s old LC_CTYPE.''' |
|||
], |
|||
[[0xA60C], [('punct', False), ('alpha', True)], |
|||
'''0xa60c VAI SYLLABLE LENGTHENER. Not in Unicode 5.0.0. |
|||
In Unicode 7.0.0. General category Lm (Letter |
|||
modifier). DerivedCoreProperties.txt says it is |
|||
“Alphabetic”. Apparently added manually to punct by mistake in |
|||
glibc’s old LC_CTYPE.''' |
|||
], |
|||
[[0x2E2F], [('punct', False), ('alpha', True)], |
|||
'''0x2E2F VERTICAL TILDE. Not in Unicode 5.0.0. In Unicode |
|||
7.0.0. General category Lm (Letter |
|||
modifier). DerivedCoreProperties.txt says it is |
|||
“Alphabetic”. Apparently added manually to punct by mistake in |
|||
glibc’s old LC_CTYPE.''' |
|||
], |
|||
[[(0x1090, 0x1099)], [('punct', False), ('alpha', True)], |
|||
'''MYANMAR SHAN DIGIT ZERO - MYANMAR SHAN DIGIT NINE. |
|||
These are digits, but because ISO C 99 forbids to |
|||
put them into digit they should go into alpha.''' |
|||
], |
|||
[[0x103F], [('punct', False), ('alpha', True)], |
|||
'''0x103F MYANMAR LETTER GREAT SA. Not in Unicode 5.0.0. |
|||
In Unicode 7.0.0. General category Lo |
|||
(Other_Letter). DerivedCoreProperties.txt says it is |
|||
“Alphabetic”. Apparently added manually to punct by |
|||
mistake in glibc’s old LC_CTYPE.''' |
|||
], |
|||
[[0x0374], [('punct', False), ('alpha', True)], |
|||
'''0x0374 GREEK NUMERAL SIGN. Unicode 5.0.0: general category |
|||
Sk. Unicode 7.0.0: General category Lm |
|||
(Modifier_Letter). DerivedCoreProperties.txt says it is |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x02EC], [('punct', False), ('alpha', True)], |
|||
'''0x02EC MODIFIER LETTER VOICING. Unicode 5.0.0: general category |
|||
Sk. Unicode 7.0.0: General category Lm |
|||
(Modifier_Letter). DerivedCoreProperties.txt says it is |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x180E], [('space', False), ('blank', False)], |
|||
'''0x180e MONGOLIAN VOWEL SEPARATOR. Unicode 5.0.0: General |
|||
category Zs (Space_Separator) Unicode 7.0.0: General category Cf |
|||
(Format).''' |
|||
], |
|||
[[0x1E9C, 0x1E9D, 0x1E9F], |
|||
[('lower', True), ('upper', False), ('tolower', False), |
|||
('toupper', False), ('totitle', False)], |
|||
'''ẜ 0x1e9c LATIN SMALL LETTER LONG S WITH DIAGONAL STROKE, |
|||
ẝ 0x1e9d LATIN SMALL LETTER LONG S WITH HIGH STROKE, |
|||
ẟ 0x1e9f LATIN SMALL LETTER DELTA. These are “Lowercase” |
|||
according to DerivedCoreProperties.txt but no upper case versions |
|||
exist.''' |
|||
], |
|||
[[0x1E9E], |
|||
[('lower', False), ('upper', True), ('tolower', True), |
|||
('toupper', False), ('totitle', False)], |
|||
'''0x1E9E ẞ LATIN CAPITAL LETTER SHARP S This is “Uppercase” |
|||
according to DerivedCoreProperties.txt and the lower case |
|||
version is 0x00DF ß LATIN SMALL LETTER SHARP S.''' |
|||
], |
|||
[[0x2188], |
|||
[('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
'''0x2188 ROMAN NUMERAL ONE HUNDRED THOUSAND. This is “Alphabetic” |
|||
according to DerivedCoreProperties.txt. In glibc’s old |
|||
LC_CTYPE, it was in “lower”, which seems to be a |
|||
mistake. It is not “Lowercase” in |
|||
DerivedCoreProperties.txt and does not have case mappings |
|||
in UnicodeData.txt either.''' |
|||
], |
|||
[[0x2C71, 0x2C74, (0x2C77, 0x2C7A)], |
|||
[('alpha', True), ('lower', True), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
'''These are Latin small letters which were not in Unicode 5.0.0 |
|||
but are in Unicode 7.0.0. According to |
|||
DerivedCoreProperties.txt they are “Lowercase”. But no |
|||
uppercase versions exist. They have apparently been added |
|||
manually to glibc’s old LC_CTYPE.''' |
|||
], |
|||
[[0xA730, 0xA731], |
|||
[('alpha', True), ('lower', True), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
'''These are Latin small “capital” letters which were not in |
|||
Unicode 5.0.0 but are in Unicode 7.0.0. According to |
|||
DerivedCoreProperties.txt they are “Lowercase”. But no |
|||
uppercase versions exist. They have apparently been added |
|||
manually to glibc’s old LC_CTYPE.''' |
|||
], |
|||
[[(0xA771, 0xA778)], |
|||
[('alpha', True), ('lower', True), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
'''These are Latin small (or small “capital”) letters which |
|||
were not in Unicodee 5.0.0 but are in Unicode 7.0.0. According to |
|||
DerivedCoreProperties.txt they are “Lowercase”. But no |
|||
uppercase versions exist. They have apparently been added |
|||
manually to glibc’s old LC_CTYPE.''' |
|||
], |
|||
[[0x0375], |
|||
[('combining', False), ('combining_level3', False), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
'''“0375;GREEK LOWER NUMERAL SIGN;Sk;0;ON;;;;;N;;;;;”. Has |
|||
apparently been added manually to glibc’s old LC_CTYPE as |
|||
“combining_level3”. That seems wrong, it is no combining |
|||
character because it does not have one of the general |
|||
categories Mn, Mc, or Me. According to |
|||
DerivedCoreProperties.txt it is not “Alphabetic”.''' |
|||
], |
|||
[[0x108D], |
|||
[('combining', True), ('combining_level3', False), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
'''“108D;MYANMAR SIGN SHAN COUNCIL EMPHATIC |
|||
TONE;Mn;220;NSM;;;;;N;;;;;”. Has apparently been added |
|||
manually to glibc’s old LC_CTYPE as |
|||
“combining_level3”. That seems wrong, although it is a |
|||
combining character because it has the general category |
|||
Mn, it is not “combining_level3” because the canonical |
|||
combining class value is 220 which is >= 200. According to |
|||
gen-unicode-ctype.c, “combining_level3” needs a |
|||
canonical combining class value < 200. According to |
|||
DerivedCoreProperties.txt it is not “Alphabetic”.''' |
|||
], |
|||
[[0x06DE], |
|||
[('combining', False), ('combining_level3', False), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' UnicodeData.txt 5.0.0: “06DE;ARABIC START OF RUB EL |
|||
HIZB;Me;0;NSM;;;;;N;;;;;”; UnicodeData.txt 7.0.0: |
|||
“06DE;ARABIC START OF RUB EL |
|||
HIZB;So;0;ON;;;;;N;;;;;”. I.e. this used to be a |
|||
combining character in Unicode 5.0.0 but not anymore in |
|||
7.0.0. According to DerivedCoreProperties.txt it is not |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x0BD0], |
|||
[('combining', False), ('combining_level3', False), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
'''Not in UnicodeData.txt 5.0.0. UnicodeData.txt 7.0.0: |
|||
“0BD0;TAMIL OM;Lo;0;L;;;;;N;;;;;”. Apparently manually added to |
|||
“combining” and “combining_level3” in glibc’s old |
|||
LC_CTYPE. That seems wrong. According to |
|||
DerivedCoreProperties.txt it is “Alphabetic”.''' |
|||
], |
|||
[[0x103F], |
|||
[('combining', False), ('combining_level3', False), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
'''Not in UnicodeData.txt 5.0.0. UnicodeData.txt 7.0.0: |
|||
“103F;MYANMAR LETTER GREAT SA;Lo;0;L;;;;;N;;;;;”. |
|||
Apparently manually added to “combining” and |
|||
“combining_level3” in glibc’s old LC_CTYPE. That seems |
|||
wrong. According to DerivedCoreProperties.txt it is |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x0901, 0x0903)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
'''These have general category “Mn” i.e. these are combining |
|||
characters (both in UnicodeData.txt 5.0.0 and 7.0.0): |
|||
“0901;DEVANAGARI SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;”, |
|||
”0902;DEVANAGARI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;”, |
|||
“0903;DEVANAGARI SIGN VISARGA;Mc;0;L;;;;;N;;;;;”. |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x093C], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
'''UnicodeData.txt (5.0.0 and 7.0.0): “093C;DEVANAGARI SIGN |
|||
NUKTA;Mn;7;NSM;;;;;N;;;;;” According to |
|||
DerivedCoreProperties.txt (7.0.0) this is *not* |
|||
“Alphabetic”. glibc’s old LC_TYPE has this in “alpha”.''' |
|||
], |
|||
[[(0x093E, 0x093F)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
'''These have general category “Mc” i.e. these are combining |
|||
characters (both in UnicodeData.txt 5.0.0 and 7.0.0): |
|||
“093E;DEVANAGARI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;” |
|||
“093F;DEVANAGARI VOWEL SIGN I;Mc;0;L;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x0940, 0x094C)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
'''These are all combining |
|||
characters (“Mc” or “Mn” both in UnicodeData.txt 5.0.0 and 7.0.0). |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x094D], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
'''Combining character, both in UnicodeData.txt 5.0.0 and 7.0.0. |
|||
“094D;DEVANAGARI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) it is *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x0951, 0x0954)], |
|||
[('combining', True), ('combining_level3', False), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
'''Combining characters, both in UnicodeData.txt 5.0.0 and 7.0.0. |
|||
According to DerivedCoreProperties.txt (7.0.0) these are *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x0962, 0x0963), (0x0981, 0x0983)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
'''Combining characters, both in UnicodeData.txt 5.0.0 and 7.0.0. |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x09BC], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“09BC;BENGALI SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;” |
|||
Combining character, both in UnicodeData.txt 5.0.0 and 7.0.0. |
|||
According to DerivedCoreProperties.txt (7.0.0) it is *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x09BE, 0x09BF), (0x09C0, 0x09C4), (0x09C7, 0x09C8), |
|||
(0x09CB, 0x09CC)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“09BE;BENGALI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;” |
|||
“09BF;BENGALI VOWEL SIGN I;Mc;0;L;;;;;N;;;;;” |
|||
“09C0;BENGALI VOWEL SIGN II;Mc;0;L;;;;;N;;;;;” |
|||
“09C1;BENGALI VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;” |
|||
“09C2;BENGALI VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;” |
|||
“09C3;BENGALI VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;” |
|||
“09C4;BENGALI VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;” |
|||
“09C7;BENGALI VOWEL SIGN E;Mc;0;L;;;;;N;;;;;” |
|||
“09C8;BENGALI VOWEL SIGN AI;Mc;0;L;;;;;N;;;;;” |
|||
“09CB;BENGALI VOWEL SIGN O;Mc;0;L;09C7 09BE;;;;N;;;;;” |
|||
“09CC;BENGALI VOWEL SIGN AU;Mc;0;L;09C7 09D7;;;;N;;;;;” |
|||
Combining characters, both in UnicodeData.txt 5.0.0 and 7.0.0. |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x09CD], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“09CD;BENGALI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;” |
|||
Combining character, both in UnicodeData.txt 5.0.0 and 7.0.0. |
|||
According to DerivedCoreProperties.txt (7.0.0) it is *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x09D7, (0x09E2, 0x09E3)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
'''Combining characters, both in UnicodeData.txt 5.0.0 and 7.0.0. |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x09F2, 0x09F3], |
|||
[('combining', False), ('combining_level3', False), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“09F2;BENGALI RUPEE MARK;Sc;0;ET;;;;;N;;;;;” |
|||
“09F3;BENGALI RUPEE SIGN;Sc;0;ET;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x09F4, 0x09FA)], |
|||
[('combining', False), ('combining_level3', False), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“09F4;BENGALI CURRENCY NUMERATOR ONE;No;0;L;;;;1/16;N;;;;;” |
|||
“09F5;BENGALI CURRENCY NUMERATOR TWO;No;0;L;;;;1/8;N;;;;;” |
|||
“09F6;BENGALI CURRENCY NUMERATOR THREE;No;0;L;;;;3/16;N;;;;;” |
|||
“09F7;BENGALI CURRENCY NUMERATOR FOUR;No;0;L;;;;1/4;N;;;;;” |
|||
“09F8;BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR; |
|||
No;0;L;;;;3/4;N;;;;;” |
|||
“09F9;BENGALI CURRENCY DENOMINATOR SIXTEEN;No;0;L;;;;16;N;;;;;” |
|||
“09FA;BENGALI ISSHAR;So;0;L;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x0A01, 0x0A03)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0A01;GURMUKHI SIGN ADAK BINDI;Mn;0;NSM;;;;;N;;;;;” |
|||
“0A02;GURMUKHI SIGN BINDI;Mn;0;NSM;;;;;N;;;;;” |
|||
“0A03;GURMUKHI SIGN VISARGA;Mc;0;L;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x0A3C], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0A3C;GURMUKHI SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) this is *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x0A3E, 0x0A40), (0x0A41, 0x0A42), (0x0A47, 0x0A48), |
|||
(0x0A4B, 0x0A4C)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0A3E;GURMUKHI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;” |
|||
“0A3F;GURMUKHI VOWEL SIGN I;Mc;0;L;;;;;N;;;;;” |
|||
“0A40;GURMUKHI VOWEL SIGN II;Mc;0;L;;;;;N;;;;;” |
|||
“0A41;GURMUKHI VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;” |
|||
“0A42;GURMUKHI VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;” |
|||
“0A47;GURMUKHI VOWEL SIGN EE;Mn;0;NSM;;;;;N;;;;;” |
|||
“0A48;GURMUKHI VOWEL SIGN AI;Mn;0;NSM;;;;;N;;;;;” |
|||
“0A4B;GURMUKHI VOWEL SIGN OO;Mn;0;NSM;;;;;N;;;;;” |
|||
“0A4C;GURMUKHI VOWEL SIGN AU;Mn;0;NSM;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x0A4D], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0A4D;GURMUKHI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) this is *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x0A51, (0x0A70, 0x0A71), 0x0A75, (0x0A81, 0x0A83)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0A4D;GURMUKHI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;” |
|||
“0A70;GURMUKHI TIPPI;Mn;0;NSM;;;;;N;;;;;” |
|||
“0A71;GURMUKHI ADDAK;Mn;0;NSM;;;;;N;;;;;” |
|||
“0A75;GURMUKHI SIGN YAKASH;Mn;0;NSM;;;;;N;;;;;” |
|||
“0A81;GUJARATI SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;” |
|||
“0A82;GUJARATI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;” |
|||
“0A83;GUJARATI SIGN VISARGA;Mc;0;L;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x0ABC], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0ABC;GUJARATI SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) this is *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x0ABE, 0x0AC5), (0x0AC7, 0x0AC9), (0x0ACB, 0x0ACC)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0ABE;GUJARATI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;” |
|||
“0ABF;GUJARATI VOWEL SIGN I;Mc;0;L;;;;;N;;;;;” |
|||
“0AC0;GUJARATI VOWEL SIGN II;Mc;0;L;;;;;N;;;;;” |
|||
“0AC1;GUJARATI VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;” |
|||
“0AC2;GUJARATI VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;” |
|||
“0AC3;GUJARATI VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;” |
|||
“0AC4;GUJARATI VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;” |
|||
“0AC5;GUJARATI VOWEL SIGN CANDRA E;Mn;0;NSM;;;;;N;;;;;” |
|||
“0AC7;GUJARATI VOWEL SIGN E;Mn;0;NSM;;;;;N;;;;;” |
|||
“0AC8;GUJARATI VOWEL SIGN AI;Mn;0;NSM;;;;;N;;;;;” |
|||
“0AC9;GUJARATI VOWEL SIGN CANDRA O;Mc;0;L;;;;;N;;;;;” |
|||
“0ACB;GUJARATI VOWEL SIGN O;Mc;0;L;;;;;N;;;;;” |
|||
“0ACC;GUJARATI VOWEL SIGN AU;Mc;0;L;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x0ACD], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0ACD;GUJARATI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) this is *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x0AE2, 0x0AE3)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0AE2;GUJARATI VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;” |
|||
“0AE3;GUJARATI VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x0AF1], |
|||
[('combining', False), ('combining_level3', False), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0AF1;GUJARATI RUPEE SIGN;Sc;0;ET;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) this is *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x0B01, 0x0B03)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0B01;ORIYA SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;” |
|||
“0B02;ORIYA SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;” |
|||
“0B03;ORIYA SIGN VISARGA;Mc;0;L;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x0B3C], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0B3C;ORIYA SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) this is *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x0B3E, 0x0B44), (0x0B47, 0x0B48), (0x0B4B, 0x0B4C)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0B3E;ORIYA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;” |
|||
“0B3F;ORIYA VOWEL SIGN I;Mn;0;NSM;;;;;N;;;;;” |
|||
“0B40;ORIYA VOWEL SIGN II;Mc;0;L;;;;;N;;;;;” |
|||
“0B41;ORIYA VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;” |
|||
“0B42;ORIYA VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;” |
|||
“0B43;ORIYA VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;” |
|||
“0B44;ORIYA VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;” |
|||
“0B47;ORIYA VOWEL SIGN E;Mc;0;L;;;;;N;;;;;” |
|||
“0B48;ORIYA VOWEL SIGN AI;Mc;0;L;0B47 0B56;;;;N;;;;;” |
|||
“0B4B;ORIYA VOWEL SIGN O;Mc;0;L;0B47 0B3E;;;;N;;;;;” |
|||
“0B4C;ORIYA VOWEL SIGN AU;Mc;0;L;0B47 0B57;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x0B4D], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0B4D;ORIYA SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) this is *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x0B56, 0x0B57), (0x0B62, 0x0B63)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0B56;ORIYA AI LENGTH MARK;Mn;0;NSM;;;;;N;;;;;” |
|||
“0B57;ORIYA AU LENGTH MARK;Mc;0;L;;;;;N;;;;;” |
|||
“0B62;ORIYA VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;” |
|||
“0B63;ORIYA VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x0B70], |
|||
[('combining', False), ('combining_level3', False), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0B70;ORIYA ISSHAR;So;0;L;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) this is *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x0B82], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0B82;TAMIL SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) this is *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x0BBE, 0x0BC2), (0x0BC6, 0x0BC8), (0x0BCA, 0x0BCC)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0BBE;TAMIL VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;” |
|||
“0BBF;TAMIL VOWEL SIGN I;Mc;0;L;;;;;N;;;;;” |
|||
“0BC0;TAMIL VOWEL SIGN II;Mn;0;NSM;;;;;N;;;;;” |
|||
“0BC1;TAMIL VOWEL SIGN U;Mc;0;L;;;;;N;;;;;” |
|||
“0BC2;TAMIL VOWEL SIGN UU;Mc;0;L;;;;;N;;;;;” |
|||
“0BC6;TAMIL VOWEL SIGN E;Mc;0;L;;;;;N;;;;;” |
|||
“0BC7;TAMIL VOWEL SIGN EE;Mc;0;L;;;;;N;;;;;” |
|||
“0BC8;TAMIL VOWEL SIGN AI;Mc;0;L;;;;;N;;;;;” |
|||
“0BCA;TAMIL VOWEL SIGN O;Mc;0;L;0BC6 0BBE;;;;N;;;;;” |
|||
“0BCB;TAMIL VOWEL SIGN OO;Mc;0;L;0BC7 0BBE;;;;N;;;;;” |
|||
“0BCC;TAMIL VOWEL SIGN AU;Mc;0;L;0BC6 0BD7;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x0BCD], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0BCD;TAMIL SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) this is *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x0BD7], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0BD7;TAMIL AU LENGTH MARK;Mc;0;L;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) this is *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x0BF0, 0x0BFA)], |
|||
[('combining', False), ('combining_level3', False), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0BF0;TAMIL NUMBER TEN;No;0;L;;;;10;N;;;;;” |
|||
“0BF1;TAMIL NUMBER ONE HUNDRED;No;0;L;;;;100;N;;;;;” |
|||
“0BF2;TAMIL NUMBER ONE THOUSAND;No;0;L;;;;1000;N;;;;;” |
|||
“0BF3;TAMIL DAY SIGN;So;0;ON;;;;;N;;;;;” |
|||
“0BF4;TAMIL MONTH SIGN;So;0;ON;;;;;N;;;;;” |
|||
“0BF5;TAMIL YEAR SIGN;So;0;ON;;;;;N;;;;;” |
|||
“0BF6;TAMIL DEBIT SIGN;So;0;ON;;;;;N;;;;;” |
|||
“0BF7;TAMIL CREDIT SIGN;So;0;ON;;;;;N;;;;;” |
|||
“0BF8;TAMIL AS ABOVE SIGN;So;0;ON;;;;;N;;;;;” |
|||
“0BF9;TAMIL RUPEE SIGN;Sc;0;ET;;;;;N;;;;;” |
|||
“0BFA;TAMIL NUMBER SIGN;So;0;ON;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) this is *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x0C01, 0x0C03)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0C01;TELUGU SIGN CANDRABINDU;Mc;0;L;;;;;N;;;;;” |
|||
“0C02;TELUGU SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;” |
|||
“0C03;TELUGU SIGN VISARGA;Mc;0;L;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x0C3E, 0x0C44), (0x0C46, 0x0C48), (0x0C4A, 0x0C4C)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0C3E;TELUGU VOWEL SIGN AA;Mn;0;NSM;;;;;N;;;;;” |
|||
“0C3F;TELUGU VOWEL SIGN I;Mn;0;NSM;;;;;N;;;;;” |
|||
“0C40;TELUGU VOWEL SIGN II;Mn;0;NSM;;;;;N;;;;;” |
|||
“0C41;TELUGU VOWEL SIGN U;Mc;0;L;;;;;N;;;;;” |
|||
“0C42;TELUGU VOWEL SIGN UU;Mc;0;L;;;;;N;;;;;” |
|||
“0C43;TELUGU VOWEL SIGN VOCALIC R;Mc;0;L;;;;;N;;;;;” |
|||
“0C44;TELUGU VOWEL SIGN VOCALIC RR;Mc;0;L;;;;;N;;;;;” |
|||
“0C46;TELUGU VOWEL SIGN E;Mn;0;NSM;;;;;N;;;;;” |
|||
“0C47;TELUGU VOWEL SIGN EE;Mn;0;NSM;;;;;N;;;;;” |
|||
“0C48;TELUGU VOWEL SIGN AI;Mn;0;NSM;0C46 0C56;;;;N;;;;;” |
|||
“0C4A;TELUGU VOWEL SIGN O;Mn;0;NSM;;;;;N;;;;;” |
|||
“0C4B;TELUGU VOWEL SIGN OO;Mn;0;NSM;;;;;N;;;;;” |
|||
“0C4C;TELUGU VOWEL SIGN AU;Mn;0;NSM;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x0C4D], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0C4D;TELUGU SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x0C55, 0x0C56), (0x0C62, 0x0C63)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0C55;TELUGU LENGTH MARK;Mn;84;NSM;;;;;N;;;;;” |
|||
“0C56;TELUGU AI LENGTH MARK;Mn;91;NSM;;;;;N;;;;;” |
|||
“0C62;TELUGU VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;” |
|||
“0C63;TELUGU VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x0C78, 0x0C7F)], |
|||
[('combining', False), ('combining_level3', False), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0C78;TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR; |
|||
No;0;ON;;;;0;N;;;;;” |
|||
“0C79;TELUGU FRACTION DIGIT ONE FOR ODD POWERS OF FOUR; |
|||
No;0;ON;;;;1;N;;;;;” |
|||
“0C7A;TELUGU FRACTION DIGIT TWO FOR ODD POWERS OF FOUR; |
|||
No;0;ON;;;;2;N;;;;;” |
|||
“0C7B;TELUGU FRACTION DIGIT THREE FOR ODD POWERS OF FOUR; |
|||
No;0;ON;;;;3;N;;;;;” |
|||
“0C7C;TELUGU FRACTION DIGIT ONE FOR EVEN POWERS OF FOUR; |
|||
No;0;ON;;;;1;N;;;;;” |
|||
“0C7D;TELUGU FRACTION DIGIT TWO FOR EVEN POWERS OF FOUR; |
|||
No;0;ON;;;;2;N;;;;;” |
|||
“0C7E;TELUGU FRACTION DIGIT THREE FOR EVEN POWERS OF FOUR; |
|||
No;0;ON;;;;3;N;;;;;” |
|||
“0C7F;TELUGU SIGN TUUMU;So;0;L;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x0C82, 0x0C83)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0C81;KANNADA SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;” |
|||
“0C82;KANNADA SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;” |
|||
“0C83;KANNADA SIGN VISARGA;Mc;0;L;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x0CBC], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0CBC;KANNADA SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x0CBE, 0x0CC4), (0x0CC6, 0x0CC8), (0x0CCA, 0x0CCC)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0CBE;KANNADA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;” |
|||
“0CBF;KANNADA VOWEL SIGN I;Mn;0;L;;;;;N;;;;;” |
|||
“0CC0;KANNADA VOWEL SIGN II;Mc;0;L;0CBF 0CD5;;;;N;;;;;” |
|||
“0CC1;KANNADA VOWEL SIGN U;Mc;0;L;;;;;N;;;;;” |
|||
“0CC2;KANNADA VOWEL SIGN UU;Mc;0;L;;;;;N;;;;;” |
|||
“0CC3;KANNADA VOWEL SIGN VOCALIC R;Mc;0;L;;;;;N;;;;;” |
|||
“0CC4;KANNADA VOWEL SIGN VOCALIC RR;Mc;0;L;;;;;N;;;;;” |
|||
“0CC6;KANNADA VOWEL SIGN E;Mn;0;L;;;;;N;;;;;” |
|||
“0CC7;KANNADA VOWEL SIGN EE;Mc;0;L;0CC6 0CD5;;;;N;;;;;” |
|||
“0CC8;KANNADA VOWEL SIGN AI;Mc;0;L;0CC6 0CD6;;;;N;;;;;” |
|||
“0CCA;KANNADA VOWEL SIGN O;Mc;0;L;0CC6 0CC2;;;;N;;;;;” |
|||
“0CCB;KANNADA VOWEL SIGN OO;Mc;0;L;0CCA 0CD5;;;;N;;;;;” |
|||
“0CCC;KANNADA VOWEL SIGN AU;Mn;0;NSM;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x0CCD], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0CCD;KANNADA SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x0CD5, 0x0CD6), (0x0CE2, 0x0CE3)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
0CD5;KANNADA LENGTH MARK;Mc;0;L;;;;;N;;;;; |
|||
0CD6;KANNADA AI LENGTH MARK;Mc;0;L;;;;;N;;;;; |
|||
0CE2;KANNADA VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;; |
|||
0CE3;KANNADA VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;; |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x0D02, 0x0D03), (0x0D3E, 0x0D44), (0x0D46, 0x0D48), |
|||
(0x0D4A, 0x0D4C)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0D02;MALAYALAM SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;” |
|||
“0D03;MALAYALAM SIGN VISARGA;Mc;0;L;;;;;N;;;;;” |
|||
“0D3E;MALAYALAM VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;” |
|||
“0D3F;MALAYALAM VOWEL SIGN I;Mc;0;L;;;;;N;;;;;” |
|||
“0D40;MALAYALAM VOWEL SIGN II;Mc;0;L;;;;;N;;;;;” |
|||
“0D41;MALAYALAM VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;” |
|||
“0D42;MALAYALAM VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;” |
|||
“0D43;MALAYALAM VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;” |
|||
“0D44;MALAYALAM VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;” |
|||
“0D46;MALAYALAM VOWEL SIGN E;Mc;0;L;;;;;N;;;;;” |
|||
“0D47;MALAYALAM VOWEL SIGN EE;Mc;0;L;;;;;N;;;;;” |
|||
“0D48;MALAYALAM VOWEL SIGN AI;Mc;0;L;;;;;N;;;;;” |
|||
“0D4A;MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;” |
|||
“0D4B;MALAYALAM VOWEL SIGN OO;Mc;0;L;0D47 0D3E;;;;N;;;;;” |
|||
“0D4C;MALAYALAM VOWEL SIGN AU;Mc;0;L;0D46 0D57;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x0D4D], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0D4D;MALAYALAM SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x0D57, (0x0D62, 0x0D63)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0D57;MALAYALAM AU LENGTH MARK;Mc;0;L;;;;;N;;;;;” |
|||
“0D62;MALAYALAM VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;” |
|||
“0D63;MALAYALAM VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x0D70, 0x0D79)], |
|||
[('combining', False), ('combining_level3', False), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0D70;MALAYALAM NUMBER TEN;No;0;L;;;;10;N;;;;;” |
|||
“0D71;MALAYALAM NUMBER ONE HUNDRED;No;0;L;;;;100;N;;;;;” |
|||
“0D72;MALAYALAM NUMBER ONE THOUSAND;No;0;L;;;;1000;N;;;;;” |
|||
“0D73;MALAYALAM FRACTION ONE QUARTER;No;0;L;;;;1/4;N;;;;;” |
|||
“0D74;MALAYALAM FRACTION ONE HALF;No;0;L;;;;1/2;N;;;;;” |
|||
“0D75;MALAYALAM FRACTION THREE QUARTERS;No;0;L;;;;3/4;N;;;;;” |
|||
“0D79;MALAYALAM DATE MARK;So;0;L;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x0D82, 0x0D83)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0D82;SINHALA SIGN ANUSVARAYA;Mc;0;L;;;;;N;;;;;” |
|||
“0D83;SINHALA SIGN VISARGAYA;Mc;0;L;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x0DCA], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0DCA;SINHALA SIGN AL-LAKUNA;Mn;9;NSM;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0x0DCF, 0x0DD4), 0x0DD6, (0x0DD8, 0x0DDF), (0x0DF2, 0x0DF3)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0DCF;SINHALA VOWEL SIGN AELA-PILLA;Mc;0;L;;;;;N;;;;;” |
|||
“0DD0;SINHALA VOWEL SIGN KETTI AEDA-PILLA;Mc;0;L;;;;;N;;;;;” |
|||
“0DD1;SINHALA VOWEL SIGN DIGA AEDA-PILLA;Mc;0;L;;;;;N;;;;;” |
|||
“0DD2;SINHALA VOWEL SIGN KETTI IS-PILLA;Mn;0;NSM;;;;;N;;;;;” |
|||
“0DD3;SINHALA VOWEL SIGN DIGA IS-PILLA;Mn;0;NSM;;;;;N;;;;;” |
|||
“0DD4;SINHALA VOWEL SIGN KETTI PAA-PILLA;Mn;0;NSM;;;;;N;;;;;” |
|||
“0DD6;SINHALA VOWEL SIGN DIGA PAA-PILLA;Mn;0;NSM;;;;;N;;;;;” |
|||
“0DD8;SINHALA VOWEL SIGN GAETTA-PILLA;Mc;0;L;;;;;N;;;;;” |
|||
“0DD9;SINHALA VOWEL SIGN KOMBUVA;Mc;0;L;;;;;N;;;;;” |
|||
“0DDA;SINHALA VOWEL SIGN DIGA KOMBUVA;Mc;0;L;0DD9 0DCA;;;;N;;;;;” |
|||
“0DDB;SINHALA VOWEL SIGN KOMBU DEKA;Mc;0;L;;;;;N;;;;;” |
|||
“0DDC;SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA; |
|||
Mc;0;L;0DD9 0DCF;;;;N;;;;;” |
|||
“0DDD;SINHALA VOWEL SIGN KOMBUVA HAA DIGA AELA-PILLA; |
|||
Mc;0;L;0DDC 0DCA;;;;N;;;;;” |
|||
“0DDE;SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA; |
|||
Mc;0;L;0DD9 0DDF;;;;N;;;;;” |
|||
“0DDF;SINHALA VOWEL SIGN GAYANUKITTA;Mc;0;L;;;;;N;;;;;” |
|||
“0DF2;SINHALA VOWEL SIGN DIGA GAETTA-PILLA;Mc;0;L;;;;;N;;;;;” |
|||
“0DF3;SINHALA VOWEL SIGN DIGA GAYANUKITTA;Mc;0;L;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[0x0DF4], |
|||
[('combining', False), ('combining_level3', False), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“0DF4;SINHALA PUNCTUATION KUNDDALIYA;Po;0;L;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0xA789, 0xA78A)], |
|||
[('combining', False), ('combining_level3', False), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“A789;MODIFIER LETTER COLON;Sk;0;L;;;;;N;;;;;” |
|||
“A78A;MODIFIER LETTER SHORT EQUALS SIGN;Sk;0;L;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are *not* |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0xA926, 0xA92A)], |
|||
[('combining', True), ('combining_level3', True), |
|||
('alpha', True), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“A926;KAYAH LI VOWEL UE;Mn;0;NSM;;;;;N;;;;;” |
|||
“A927;KAYAH LI VOWEL E;Mn;0;NSM;;;;;N;;;;;” |
|||
“A928;KAYAH LI VOWEL U;Mn;0;NSM;;;;;N;;;;;” |
|||
“A929;KAYAH LI VOWEL EE;Mn;0;NSM;;;;;N;;;;;” |
|||
“A92A;KAYAH LI VOWEL O;Mn;0;NSM;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are |
|||
“Alphabetic”.''' |
|||
], |
|||
[[(0xA92B, 0xA92D)], |
|||
[('combining', True), ('combining_level3', False), |
|||
('alpha', False), ('lower', False), ('upper', False), |
|||
('tolower', False), ('toupper', False), ('totitle', False)], |
|||
''' |
|||
“A92B;KAYAH LI TONE PLOPHU;Mn;220;NSM;;;;;N;;;;;” |
|||
“A92C;KAYAH LI TONE CALYA;Mn;220;NSM;;;;;N;;;;;” |
|||
“A92D;KAYAH LI TONE CALYA PLOPHU;Mn;220;NSM;;;;;N;;;;;” |
|||
According to DerivedCoreProperties.txt (7.0.0) these are *not* |
|||
“Alphabetic”.''' |
|||
] |
|||
] |
|||
@ -0,0 +1,751 @@ |
|||
#!/usr/bin/python3 |
|||
# |
|||
# Generate a Unicode conforming LC_CTYPE category from a UnicodeData file. |
|||
# Copyright (C) 2014, 2015 Free Software Foundation, Inc. |
|||
# This file is part of the GNU C Library. |
|||
# Based on gen-unicode-ctype.c by Bruno Haible <haible@clisp.cons.org>, 2000. |
|||
# |
|||
# The GNU C Library is free software; you can redistribute it and/or |
|||
# modify it under the terms of the GNU Lesser General Public |
|||
# License as published by the Free Software Foundation; either |
|||
# version 2.1 of the License, or (at your option) any later version. |
|||
# |
|||
# The GNU C Library is distributed in the hope that it will be useful, |
|||
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|||
# Lesser General Public License for more details. |
|||
# |
|||
# You should have received a copy of the GNU Lesser General Public |
|||
# License along with the GNU C Library; if not, see |
|||
# <http://www.gnu.org/licenses/>. |
|||
|
|||
''' |
|||
Generate a Unicode conforming LC_CTYPE category from UnicodeData.txt and |
|||
DerivedCoreProperties.txt files. |
|||
|
|||
To see how this script is used, call it with the “-h” option: |
|||
|
|||
$ ./gen_unicode_ctype.py -h |
|||
… prints usage message … |
|||
''' |
|||
|
|||
import argparse |
|||
import sys |
|||
import time |
|||
import re |
|||
|
|||
# Dictionary holding the entire contents of the UnicodeData.txt file,
# keyed by integer code point.  Filled in by fill_attributes().
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#  …
# }
UNICODE_ATTRIBUTES = {}

# Dictionary holding the entire contents of the DerivedCoreProperties.txt
# file, keyed by integer code point; each value is the list of property
# names for that code point.  Filled in by fill_derived_core_properties().
#
# Contents of this dictionary look like this:
#
# {917504: ['Default_Ignorable_Code_Point'],
#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
#  …
# }
DERIVED_CORE_PROPERTIES = {}
|||
|
|||
def fill_attribute(code_point, fields):
    '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.

    One entry in the UNICODE_ATTRIBUTES dictionary represents one line
    in the UnicodeData.txt file.

    '''
    # Fields 12–14 hold optional simple case mappings, written as
    # hexadecimal code points; an empty field means “no mapping”.
    def parse_hex_or_none(field):
        return int(field, 16) if field else None

    UNICODE_ATTRIBUTES[code_point] = {
        'name': fields[1],           # Character name
        'category': fields[2],       # General category
        'combining': fields[3],      # Canonical combining classes
        'bidi': fields[4],           # Bidirectional category
        'decomposition': fields[5],  # Character decomposition mapping
        'decdigit': fields[6],       # Decimal digit value
        'digit': fields[7],          # Digit value
        'numeric': fields[8],        # Numeric value
        'mirrored': fields[9],       # mirrored
        'oldname': fields[10],       # Old Unicode 1.0 name
        'comment': fields[11],       # comment
        'upper': parse_hex_or_none(fields[12]),  # Uppercase mapping
        'lower': parse_hex_or_none(fields[13]),  # Lowercase mapping
        'title': parse_hex_or_none(fields[14]),  # Titlecase mapping
    }
|||
|
|||
def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
    '''
    with open(filename, mode='r') as unicode_data_file:
        # fields_start remembers the fields of a pending “…, First>”
        # line until the matching “…, Last>” line is seen.
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            if len(fields) != 15:
                # UnicodeData.txt lines always have exactly 15 fields.
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' %{
                    'f': filename, 'l': line})
                exit(1)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                # Start of a range: strip “<” and “, First>” from the
                # name so it matches the name on the “Last” line below.
                fields_start = fields
                fields_start[1] = fields_start[1].split(',')[0][1:]
                continue
            if fields[1].endswith(', Last>'):
                fields[1] = fields[1].split(',')[0][1:]
                # Apart from the code point itself, the “First” and
                # “Last” lines of a range must be identical.
                if fields[1:] != fields_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n' %{
                        'f': filename, 'l': line})
                    exit(1)
                # Expand the range: every code point in it gets the
                # same attributes.
                for code_point in range(
                        int(fields_start[0], 16),
                        int(fields[0], 16)+1):
                    fill_attribute(code_point, fields)
                fields_start = []
                continue
            # Ordinary single-code-point line.
            fill_attribute(int(fields[0], 16), fields)
            fields_start = []
|||
|
|||
def fill_derived_core_properties(filename):
    '''Stores the entire contents of the DerivedCoreProperties.txt file
    in the DERIVED_CORE_PROPERTIES dictionary.

    Lines in DerivedCoreProperties.txt are either a code point range like
    this:

    0061..007A    ; Lowercase # L&  [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z

    or a single code point like this:

    00AA          ; Lowercase # Lo       FEMININE ORDINAL INDICATOR

    Comment and blank lines (anything not matching the pattern) are
    skipped silently.
    '''
    with open(filename, mode='r') as derived_core_properties_file:
        for line in derived_core_properties_file:
            match = re.match(
                r'^(?P<codepoint1>[0-9A-F]{4,6})'
                + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
                + r'\s*;\s*(?P<property>[a-zA-Z_]+)',
                line)
            if not match:
                continue
            start = match.group('codepoint1')
            end = match.group('codepoint2')
            if not end:
                # A single code point is a range of length one.
                end = start
            # The property name is the same for the whole range;
            # look it up once instead of once per code point.
            prop = match.group('property')
            for code_point in range(int(start, 16), int(end, 16)+1):
                DERIVED_CORE_PROPERTIES.setdefault(code_point, []).append(prop)
|||
|
|||
def to_upper(code_point):
    '''Returns the code point of the uppercase version
    of the given code point'''
    attributes = UNICODE_ATTRIBUTES[code_point]
    # Only named characters with an explicit uppercase mapping change;
    # everything else maps to itself.
    if attributes['name'] and attributes['upper']:
        return attributes['upper']
    return code_point
|||
|
|||
def to_lower(code_point):
    '''Returns the code point of the lowercase version
    of the given code point'''
    attributes = UNICODE_ATTRIBUTES[code_point]
    # Only named characters with an explicit lowercase mapping change;
    # everything else maps to itself.
    if attributes['name'] and attributes['lower']:
        return attributes['lower']
    return code_point
|||
|
|||
def to_title(code_point):
    '''Returns the code point of the titlecase version
    of the given code point'''
    attributes = UNICODE_ATTRIBUTES[code_point]
    # Only named characters with an explicit titlecase mapping change;
    # everything else maps to itself.
    if attributes['name'] and attributes['title']:
        return attributes['title']
    return code_point
|||
|
|||
def is_upper(code_point):
    '''Checks whether the character with this code point is uppercase'''
    # Either the character has a distinct lowercase mapping, or it is
    # listed as “Uppercase” in DerivedCoreProperties.txt.
    if to_lower(code_point) != code_point:
        return True
    return (code_point in DERIVED_CORE_PROPERTIES
            and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point])
|||
|
|||
def is_lower(code_point):
    '''Checks whether the character with this code point is lowercase'''
    # Some characters are defined as “Lowercase” in
    # DerivedCoreProperties.txt but do not have a mapping to upper
    # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is
    # one of these.
    if to_upper(code_point) != code_point:
        return True
    # <U00DF> is lowercase, but without simple to_upper mapping.
    if code_point == 0x00DF:
        return True
    return (code_point in DERIVED_CORE_PROPERTIES
            and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point])
|||
|
|||
def is_alpha(code_point):
    '''Checks whether the character with this code point is alphabetic'''
    if (code_point in DERIVED_CORE_PROPERTIES
            and 'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point]):
        return True
    # Consider all the non-ASCII digits as alphabetic.
    # ISO C 99 forbids us to have them in category “digit”,
    # but we want iswalnum to return true on them.
    return (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
            and not 0x0030 <= code_point <= 0x0039)
|||
|
|||
def is_digit(code_point):
    '''Checks whether the character with this code point is a digit.

    SUSV2 gives us some freedom for the "digit" category, but ISO C 99
    takes it away:
    7.25.2.1.5:
        The iswdigit function tests for any wide character that
        corresponds to a decimal-digit character (as defined in 5.2.1).
    5.2.1:
        the 10 decimal digits 0 1 2 3 4 5 6 7 8 9

    (The alternative — Unicode general category “Nd” — is deliberately
    not used.  Note also that U+0BE7..U+0BEF and U+1369..U+1371 are
    digit systems without a zero; one would have to add <0> in front
    of them by hand.)
    '''
    # Only the BASIC LATIN digits ‘0’..‘9’ qualify under ISO C 99.
    return 0x0030 <= code_point <= 0x0039
|||
|
|||
def is_outdigit(code_point):
    '''Checks whether the character with this code point is outdigit'''
    # By default the outdigits are exactly the ASCII digits ‘0’..‘9’.
    return 0x0030 <= code_point <= 0x0039
|||
|
|||
def is_blank(code_point):
    '''Checks whether the character with this code point is blank'''
    attributes = UNICODE_ATTRIBUTES[code_point]
    # Horizontal tab, plus category Zs without mention of '<noBreak>'
    # in the decomposition.
    return (code_point == 0x0009  # '\t'
            or (attributes['name']
                and attributes['category'] == 'Zs'
                and '<noBreak>' not in attributes['decomposition']))
|||
|
|||
def is_space(code_point):
    '''Checks whether the character with this code point is a space'''
    # Don’t make U+00A0 a space. Non-breaking space means that all programs
    # should treat it like a punctuation character, not like a space.
    if code_point in (0x0020,   # ' '
                      0x000C,   # '\f'
                      0x000A,   # '\n'
                      0x000D,   # '\r'
                      0x0009,   # '\t'
                      0x000B):  # '\v'
        return True
    # Categories Zl, Zp, and Zs without mention of "<noBreak>"
    attributes = UNICODE_ATTRIBUTES[code_point]
    return (attributes['name']
            and (attributes['category'] in ['Zl', 'Zp']
                 or (attributes['category'] in ['Zs']
                     and '<noBreak>' not in attributes['decomposition'])))
|||
|
|||
def is_cntrl(code_point):
    '''Checks whether the character with this code point is
    a control character'''
    attributes = UNICODE_ATTRIBUTES[code_point]
    # Control characters proper, plus the line/paragraph separators.
    return (attributes['name']
            and (attributes['name'] == '<control>'
                 or attributes['category'] in ['Zl', 'Zp']))
|||
|
|||
def is_xdigit(code_point):
    '''Checks whether the character with this code point is
    a hexadecimal digit.

    SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
    takes it away:
    7.25.2.1.12:
        The iswxdigit function tests for any wide character that
        corresponds to a hexadecimal-digit character (as defined
        in 6.4.4.1).
    6.4.4.1:
        hexadecimal-digit: one of
        0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
    '''
    # ASCII digits plus the ASCII letters a-f/A-F only.
    return (0x0030 <= code_point <= 0x0039
            or 0x0041 <= code_point <= 0x0046
            or 0x0061 <= code_point <= 0x0066)
|||
|
|||
def is_graph(code_point):
    '''Checks whether the character with this code point is
    a graphical character'''
    attributes = UNICODE_ATTRIBUTES[code_point]
    # Named, not the <control> placeholder, and not a space.
    return (attributes['name']
            and attributes['name'] != '<control>'
            and not is_space(code_point))
|||
|
|||
def is_print(code_point):
    '''Checks whether the character with this code point is printable'''
    attributes = UNICODE_ATTRIBUTES[code_point]
    # Named, not the <control> placeholder, and not a line or
    # paragraph separator.
    return (attributes['name']
            and attributes['name'] != '<control>'
            and attributes['category'] not in ['Zl', 'Zp'])
|||
|
|||
def is_punct(code_point):
    '''Checks whether the character with this code point is punctuation.

    The traditional POSIX definition of punctuation is every graphic,
    non-alphanumeric character.  (The alternative definition — Unicode
    general category starting with “P” — is deliberately not used.)
    '''
    return (is_graph(code_point)
            and not is_alpha(code_point)
            and not is_digit(code_point))
|||
|
|||
def is_combining(code_point):
    '''Checks whether the character with this code point is
    a combining character'''
    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
    # file. In 3.0.1 it was identical to the union of the general categories
    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
    # PropList.txt file, so we take the latter definition.
    attributes = UNICODE_ATTRIBUTES[code_point]
    return attributes['name'] and attributes['category'] in ['Mn', 'Mc', 'Me']
|||
|
|||
def is_combining_level3(code_point):
    '''Checks whether the character with this code point is
    a combining level3 character'''
    # Level 3 combining characters have a canonical combining class
    # below 200.
    return (is_combining(code_point)
            and 0 <= int(UNICODE_ATTRIBUTES[code_point]['combining']) < 200)
|||
|
|||
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.'''
    # Code points in the BMP use four hex digits, all others eight.
    digits = 4 if code_point < 0x10000 else 8
    return '<U{:0{}X}>'.format(code_point, digits)
|||
|
|||
def ucs_symbol_range(code_point_low, code_point_high):
    '''Returns a string UCS symbol string for a code point range.

    Example:

    <U0041>..<U005A>
    '''
    return '{}..{}'.format(ucs_symbol(code_point_low),
                           ucs_symbol(code_point_high))
|||
|
|||
def code_point_ranges(is_class_function):
    '''Returns a list of ranges of code points for which is_class_function
    returns True.

    Example:

    [[65, 90], [192, 214], [216, 222], [256], … ]
    '''
    cp_ranges = []
    for code_point in sorted(UNICODE_ATTRIBUTES):
        if not is_class_function(code_point):
            continue
        if cp_ranges and cp_ranges[-1][-1] == code_point - 1:
            # Contiguous with the last range: grow it.  A one-element
            # entry becomes a pair; a pair has its end bumped.
            if len(cp_ranges[-1]) == 1:
                cp_ranges[-1].append(code_point)
            else:
                cp_ranges[-1][-1] = code_point
        else:
            # Gap before this code point: start a new range.
            cp_ranges.append([code_point])
    return cp_ranges
|||
|
|||
def output_charclass(i18n_file, class_name, is_class_function):
    '''Output a LC_CTYPE character class section

    Example:

    upper /
       <U0041>..<U005A>;<U00C0>..<U00D6>;<U00D8>..<U00DE>;<U0100>;<U0102>;/
    …
       <U0001D790>..<U0001D7A8>;<U0001D7CA>;<U0001F130>..<U0001F149>;/
       <U0001F150>..<U0001F169>;<U0001F170>..<U0001F189>
    '''
    cp_ranges = code_point_ranges(is_class_function)
    if cp_ranges:
        i18n_file.write('%s /\n' %class_name)
        max_column = 75  # wrap output lines before this column
        prefix = ' '
        line = prefix
        range_string = ''
        for code_point_range in cp_ranges:
            if line.strip():
                # There is already a range on this line: separate with ‘;’.
                line += ';'
            if len(code_point_range) == 1:
                range_string = ucs_symbol(code_point_range[0])
            else:
                range_string = ucs_symbol_range(
                    code_point_range[0], code_point_range[-1])
            if len(line+range_string) > max_column:
                # Next range would overflow the column limit: flush the
                # current line with a ‘/’ continuation marker.
                i18n_file.write(line+'/\n')
                line = prefix
            line += range_string
        if line.strip():
            # Flush any partially filled last line.
            i18n_file.write(line+'\n')
        i18n_file.write('\n')
|||
|
|||
def output_charmap(i18n_file, map_name, map_function):
    '''Output a LC_CTYPE character map section

    Example:

    toupper /
      (<U0061>,<U0041>);(<U0062>,<U0042>);(<U0063>,<U0043>);(<U0064>,<U0044>);/
    …
      (<U000118DC>,<U000118BC>);(<U000118DD>,<U000118BD>);/
      (<U000118DE>,<U000118BE>);(<U000118DF>,<U000118BF>)
    '''
    max_column = 75  # wrap output lines before this column
    prefix = ' '
    line = prefix
    map_string = ''
    i18n_file.write('%s /\n' %map_name)
    for code_point in sorted(UNICODE_ATTRIBUTES):
        mapped = map_function(code_point)
        # Only code points whose mapping differs from the identity
        # are written out.
        if code_point != mapped:
            if line.strip():
                # There is already a pair on this line: separate with ‘;’.
                line += ';'
            map_string = '(' \
                         + ucs_symbol(code_point) \
                         + ',' \
                         + ucs_symbol(mapped) \
                         + ')'
            if len(line+map_string) > max_column:
                # Next pair would overflow the column limit: flush the
                # current line with a ‘/’ continuation marker.
                i18n_file.write(line+'/\n')
                line = prefix
            line += map_string
    if line.strip():
        # Flush any partially filled last line.
        i18n_file.write(line+'\n')
    i18n_file.write('\n')
|||
|
|||
def verifications():
    '''Tests whether the is_* functions observe the known restrictions.

    Each violation of the POSIX/ISO 14652 LC_CTYPE class restrictions
    is reported on stderr; nothing is returned.
    '''
    for code_point in sorted(UNICODE_ATTRIBUTES):
        # toupper restriction: “Only characters specified for the keywords
        # lower and upper shall be specified.”
        if (to_upper(code_point) != code_point
            and not (is_lower(code_point) or is_upper(code_point))):
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
                    'sym': ucs_symbol(code_point),
                    'c': code_point,
                    'uc': to_upper(code_point)})
        # tolower restriction: “Only characters specified for the keywords
        # lower and upper shall be specified.”
        if (to_lower(code_point) != code_point
            and not (is_lower(code_point) or is_upper(code_point))):
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
                    'sym': ucs_symbol(code_point),
                    'c': code_point,
                    'uc': to_lower(code_point)})
        # alpha restriction: “Characters classified as either upper or lower
        # shall automatically belong to this class.”
        if ((is_lower(code_point) or is_upper(code_point))
             and not is_alpha(code_point)):
            sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{
                'sym': ucs_symbol(code_point)})
        # alpha restriction: “No character specified for the keywords cntrl,
        # digit, punct or space shall be specified.”
        if (is_alpha(code_point) and is_cntrl(code_point)):
            sys.stderr.write('%(sym)s is alpha and cntrl\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_alpha(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is alpha and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_alpha(code_point) and is_punct(code_point)):
            sys.stderr.write('%(sym)s is alpha and punct\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_alpha(code_point) and is_space(code_point)):
            sys.stderr.write('%(sym)s is alpha and space\n' %{
                'sym': ucs_symbol(code_point)})
        # space restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, graph or xdigit shall be specified.”
        # upper, lower, alpha already checked above.
        if (is_space(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is space and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_space(code_point) and is_graph(code_point)):
            sys.stderr.write('%(sym)s is space and graph\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_space(code_point) and is_xdigit(code_point)):
            sys.stderr.write('%(sym)s is space and xdigit\n' %{
                'sym': ucs_symbol(code_point)})
        # cntrl restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, punct, graph, print or xdigit shall be
        # specified.” upper, lower, alpha already checked above.
        if (is_cntrl(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is cntrl and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_punct(code_point)):
            sys.stderr.write('%(sym)s is cntrl and punct\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_graph(code_point)):
            sys.stderr.write('%(sym)s is cntrl and graph\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_print(code_point)):
            sys.stderr.write('%(sym)s is cntrl and print\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_xdigit(code_point)):
            sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{
                'sym': ucs_symbol(code_point)})
        # punct restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
        # be specified.” upper, lower, alpha, cntrl already checked above.
        if (is_punct(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is punct and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_punct(code_point) and is_xdigit(code_point)):
            sys.stderr.write('%(sym)s is punct and xdigit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_punct(code_point) and code_point == 0x0020):
            sys.stderr.write('%(sym)s is punct\n' %{
                'sym': ucs_symbol(code_point)})
        # graph restriction: “No character specified for the keyword cntrl
        # shall be specified.” Already checked above.

        # print restriction: “No character specified for the keyword cntrl
        # shall be specified.” Already checked above.

        # graph - print relation: differ only in the <space> character.
        # How is this possible if there are more than one space character?!
        # I think susv2/xbd/locale.html should speak of “space characters”,
        # not “space character”.
        if (is_print(code_point)
            and not (is_graph(code_point) or is_space(code_point))):
            sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{
                'sym': ucs_symbol(code_point)})
        if (not is_print(code_point)
            and (is_graph(code_point) or code_point == 0x0020)):
            sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{
                'sym': ucs_symbol(code_point)})
|||
|
|||
def read_input_file(filename):
    '''Reads the original glibc i18n file to get the original head
    and tail.

    We want to replace only the character classes in LC_CTYPE, and the
    date stamp. All the rest of the i18n file should stay unchanged.
    To avoid having to cut and paste the generated data into the
    original file, it is helpful to read the original file here
    to be able to generate a complete result file.
    '''
    head = tail = ''
    with open(filename, mode='r') as i18n_file:
        # First pass: copy everything up to and including the
        # “LC_CTYPE” line into head, refreshing the “date” line
        # with today’s date along the way.
        for line in i18n_file:
            match = re.match(
                r'^(?P<key>date\s+)(?P<value>"[0-9]{4}-[0-9]{2}-[0-9]{2}")',
                line)
            if match:
                line = match.group('key') \
                       + '"{:s}"\n'.format(time.strftime('%Y-%m-%d'))
            head = head + line
            if line.startswith('LC_CTYPE'):
                break
        # Second pass: skip the generated character classes; resume
        # at the “translit_start” line, which begins the tail.
        for line in i18n_file:
            if line.startswith('translit_start'):
                tail = line
                break
        # Third pass: everything after “translit_start” belongs to
        # the tail as-is.
        for line in i18n_file:
            tail = tail + line
    return (head, tail)
|||
|
|||
def output_head(i18n_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before the “LC_CTYPE” line.
    '''
    # NOTE(review): ARGS looks like the argparse namespace created at
    # module level, outside this excerpt — confirm there.
    if ARGS.input_file and head:
        # Reuse the head copied from the original input file
        # (with its date stamp already refreshed).
        i18n_file.write(head)
    else:
        # No input file: generate a minimal header with a fresh
        # LC_IDENTIFICATION section.
        i18n_file.write('escape_char /\n')
        i18n_file.write('comment_char %\n')
        i18n_file.write('\n')
        i18n_file.write('% Generated automatically by '
                        + 'gen_unicode_ctype.py '
                        + 'for Unicode {:s}.\n'.format(unicode_version))
        i18n_file.write('\n')
        i18n_file.write('LC_IDENTIFICATION\n')
        i18n_file.write('title "Unicode {:s} FDCC-set"\n'.format(
            unicode_version))
        i18n_file.write('source "UnicodeData.txt, '
                        + 'DerivedCoreProperties.txt"\n')
        i18n_file.write('address ""\n')
        i18n_file.write('contact ""\n')
        i18n_file.write('email "bug-glibc-locales@gnu.org"\n')
        i18n_file.write('tel ""\n')
        i18n_file.write('fax ""\n')
        i18n_file.write('language ""\n')
        i18n_file.write('territory "Earth"\n')
        i18n_file.write('revision "{:s}"\n'.format(unicode_version))
        i18n_file.write('date "{:s}"\n'.format(
            time.strftime('%Y-%m-%d')))
        i18n_file.write('category "unicode:2014";LC_CTYPE\n')
        i18n_file.write('END LC_IDENTIFICATION\n')
        i18n_file.write('\n')
        i18n_file.write('LC_CTYPE\n')
|||
|
|||
def output_tail(i18n_file, tail=''):
    '''Write everything after the last LC_CTYPE character class.

    Copies the saved tail of the original i18n file verbatim when one
    was given on the command line; otherwise just closes the LC_CTYPE
    section.
    '''
    if tail and ARGS.input_file:
        i18n_file.write(tail)
        return
    i18n_file.write('END LC_CTYPE\n')
|||
|
|||
def output_tables(i18n_file, unicode_version):
    '''Write the new LC_CTYPE character classes to the output file.

    Relies on the module level helpers output_charclass()/output_charmap()
    and the is_*/to_* predicate and mapping functions defined elsewhere
    in this script; each class is preceded by an explanatory “%” comment
    in the generated locale file.
    '''
    # Intro comment block of the generated LC_CTYPE section.
    i18n_file.write('% The following is the 14652 i18n fdcc-set '
                    + 'LC_CTYPE category.\n')
    i18n_file.write('% It covers Unicode version {:s}.\n'.format(
        unicode_version))
    i18n_file.write('% The character classes and mapping tables were '
                    + 'automatically\n')
    i18n_file.write('% generated using the gen_unicode_ctype.py '
                    + 'program.\n\n')
    i18n_file.write('% The "upper" class reflects the uppercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'upper', is_upper)
    i18n_file.write('% The "lower" class reflects the lowercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'lower', is_lower)
    i18n_file.write('% The "alpha" class of the "i18n" FDCC-set is '
                    + 'reflecting\n')
    i18n_file.write('% the recommendations in TR 10176 annex A\n')
    output_charclass(i18n_file, 'alpha', is_alpha)
    i18n_file.write('% The "digit" class must only contain the '
                    + 'BASIC LATIN digits, says ISO C 99\n')
    i18n_file.write('% (sections 7.25.2.1.5 and 5.2.1).\n')
    output_charclass(i18n_file, 'digit', is_digit)
    # "outdigit" is deliberately only written as a comment: localedef
    # supplies the default 0..9, and emitting it here would prevent
    # locales that copy this file from defining their own values.
    i18n_file.write('% The "outdigit" information is by default '
                    + '"0" to "9".  We don\'t have to\n')
    i18n_file.write('% provide it here since localedef will fill '
                    + 'in the bits and it would\n')
    i18n_file.write('% prevent locales copying this file define '
                    + 'their own values.\n')
    i18n_file.write('% outdigit /\n')
    i18n_file.write('%    <U0030>..<U0039>\n\n')
    # output_charclass(i18n_file, 'outdigit', is_outdigit)
    output_charclass(i18n_file, 'space', is_space)
    output_charclass(i18n_file, 'cntrl', is_cntrl)
    output_charclass(i18n_file, 'punct', is_punct)
    output_charclass(i18n_file, 'graph', is_graph)
    output_charclass(i18n_file, 'print', is_print)
    i18n_file.write('% The "xdigit" class must only contain the '
                    + 'BASIC LATIN digits and A-F, a-f,\n')
    i18n_file.write('% says ISO C 99 '
                    + '(sections 7.25.2.1.12 and 6.4.4.1).\n')
    output_charclass(i18n_file, 'xdigit', is_xdigit)
    output_charclass(i18n_file, 'blank', is_blank)
    # Case mapping tables, including the non-standard "totitle" map.
    output_charmap(i18n_file, 'toupper', to_upper)
    output_charmap(i18n_file, 'tolower', to_lower)
    output_charmap(i18n_file, 'map "totitle";', to_title)
    i18n_file.write('% The "combining" class reflects ISO/IEC 10646-1 '
                    + 'annex B.1\n')
    i18n_file.write('% That is, all combining characters (level 2+3).\n')
    output_charclass(i18n_file, 'class "combining";', is_combining)
    i18n_file.write('% The "combining_level3" class reflects '
                    + 'ISO/IEC 10646-1 annex B.2\n')
    i18n_file.write('% That is, combining characters of level 3.\n')
    output_charclass(i18n_file,
                     'class "combining_level3";', is_combining_level3)
|||
|
|||
if __name__ == "__main__":
    # Command line interface: file locations for the Unicode input
    # data, an optional original i18n file to preserve, the output
    # file, and the (required) Unicode version string.
    PARSER = argparse.ArgumentParser(
        description='''
    Generate a Unicode conforming LC_CTYPE category from
    UnicodeData.txt and DerivedCoreProperties.txt files.
    ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-d', '--derived_core_properties_file',
        nargs='?',
        type=str,
        default='DerivedCoreProperties.txt',
        help=('The DerivedCoreProperties.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help='''The original glibc/localedata/locales/i18n file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='i18n.new',
        help='''The file which shall contain the generated LC_CTYPE category,
        default: %(default)s.  If the original
        glibc/localedata/locales/i18n has been given
        as an option, all data from the original file
        except the newly generated LC_CTYPE character
        classes and the date stamp in
        LC_IDENTIFICATION will be copied unchanged
        into the output file.  ''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    # Load the Unicode data into the module-level dictionaries and run
    # the internal consistency checks before generating any output.
    fill_attributes(ARGS.unicode_data_file)
    fill_derived_core_properties(ARGS.derived_core_properties_file)
    verifications()
    HEAD = TAIL = ''
    if ARGS.input_file:
        # Preserve everything around the LC_CTYPE section of the
        # original i18n file so only the generated part changes.
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    with open(ARGS.output_file, mode='w') as I18N_FILE:
        output_head(I18N_FILE, ARGS.unicode_version, head=HEAD)
        output_tables(I18N_FILE, ARGS.unicode_version)
        output_tail(I18N_FILE, tail=TAIL)
|||
@ -0,0 +1,50 @@ |
|||
UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE |
|||
|
|||
Unicode Data Files include all data files under the directories |
|||
http://www.unicode.org/Public/, http://www.unicode.org/reports/, and |
|||
http://www.unicode.org/cldr/data/. Unicode Data Files do not include PDF |
|||
online code charts under the directory http://www.unicode.org/Public/. |
|||
Software includes any source code published in the Unicode Standard or under |
|||
the directories http://www.unicode.org/Public/, |
|||
http://www.unicode.org/reports/, and http://www.unicode.org/cldr/data/. |
|||
|
|||
NOTICE TO USER: Carefully read the following legal agreement. BY |
|||
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S DATA FILES |
|||
("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), YOU UNEQUIVOCALLY ACCEPT, AND |
|||
AGREE TO BE BOUND BY, ALL OF THE TERMS AND CONDITIONS OF THIS AGREEMENT. IF |
|||
YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA |
|||
FILES OR SOFTWARE. |
|||
|
|||
COPYRIGHT AND PERMISSION NOTICE |
|||
|
|||
Copyright © 1991-2013 Unicode, Inc. All rights reserved. Distributed under |
|||
the Terms of Use in http://www.unicode.org/copyright.html. |
|||
|
|||
Permission is hereby granted, free of charge, to any person obtaining a |
|||
copy of the Unicode data files and any associated documentation (the "Data |
|||
Files") or Unicode software and any associated documentation (the "Software") |
|||
to deal in the Data Files or Software without restriction, including without |
|||
limitation the rights to use, copy, modify, merge, publish, distribute, and/or |
|||
sell copies of the Data Files or Software, and to permit persons to whom the |
|||
Data Files or Software are furnished to do so, provided that (a) the above |
|||
copyright notice(s) and this permission notice appear with all copies of the |
|||
Data Files or Software, (b) both the above copyright notice(s) and this |
|||
permission notice appear in associated documentation, and (c) there is clear |
|||
notice in each modified Data File or in the Software as well as in the |
|||
documentation associated with the Data File(s) or Software that the data or |
|||
software has been modified. |
|||
|
|||
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY |
|||
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
|||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD |
|||
PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN |
|||
THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL |
|||
DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR |
|||
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS |
|||
ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE |
|||
DATA FILES OR SOFTWARE. |
|||
|
|||
Except as contained in this notice, the name of a copyright holder shall |
|||
not be used in advertising or otherwise to promote the sale, use or other |
|||
dealings in these Data Files or Software without prior written authorization |
|||
of the copyright holder. |
|||
@ -0,0 +1,399 @@ |
|||
#!/usr/bin/python3 |
|||
# -*- coding: utf-8 -*- |
|||
# Copyright (C) 2014, 2015 Free Software Foundation, Inc. |
|||
# This file is part of the GNU C Library. |
|||
# |
|||
# The GNU C Library is free software; you can redistribute it and/or |
|||
# modify it under the terms of the GNU Lesser General Public |
|||
# License as published by the Free Software Foundation; either |
|||
# version 2.1 of the License, or (at your option) any later version. |
|||
# |
|||
# The GNU C Library is distributed in the hope that it will be useful, |
|||
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|||
# Lesser General Public License for more details. |
|||
# |
|||
# You should have received a copy of the GNU Lesser General Public |
|||
# License along with the GNU C Library; if not, see |
|||
# <http://www.gnu.org/licenses/>. |
|||
|
|||
''' |
|||
This script is useful for checking backward compatibility of newly |
|||
generated UTF-8 file from utf8_gen.py script |
|||
|
|||
To see how this script is used, call it with the “-h” option: |
|||
|
|||
$ ./utf8_compatibility.py -h |
|||
… prints usage message … |
|||
''' |
|||
|
|||
import sys |
|||
import re |
|||
import argparse |
|||
|
|||
# Dictionary holding the entire contents of the UnicodeData.txt file |
|||
# |
|||
# Contents of this dictionary look like this: |
|||
# |
|||
# {0: {'category': 'Cc', |
|||
# 'title': None, |
|||
# 'digit': '', |
|||
# 'name': '<control>', |
|||
# 'bidi': 'BN', |
|||
# 'combining': '0', |
|||
# 'comment': '', |
|||
# 'oldname': 'NULL', |
|||
# 'decomposition': '', |
|||
# 'upper': None, |
|||
# 'mirrored': 'N', |
|||
# 'lower': None, |
|||
# 'decdigit': '', |
|||
# 'numeric': ''}, |
|||
# … |
|||
# } |
|||
# Code point → dictionary of that code point's fields from
# UnicodeData.txt; see fill_attribute() below for the keys used.
UNICODE_ATTRIBUTES = {}

# Code point → East Asian width property value read from the
# EastAsianWidths.txt file, e.g. {0: 'N', …, 45430: 'W', …}.
EAST_ASIAN_WIDTHS = {}


def fill_attribute(code_point, fields):
    '''Store the attribute fields of one UnicodeData.txt line in
    UNICODE_ATTRIBUTES[code_point].

    fields[0] (the code point itself) is not stored; fields 1-11 are
    kept as strings, and the three case-mapping fields (12-14) are
    converted to integers, or None when the field is empty.
    '''
    attributes = {}
    # UnicodeData.txt fields 1-11 are stored verbatim under these keys.
    string_keys = ('name',           # Character name
                   'category',       # General category
                   'combining',      # Canonical combining classes
                   'bidi',           # Bidirectional category
                   'decomposition',  # Character decomposition mapping
                   'decdigit',       # Decimal digit value
                   'digit',          # Digit value
                   'numeric',        # Numeric value
                   'mirrored',       # mirrored
                   'oldname',        # Old Unicode 1.0 name
                   'comment')        # comment
    for index, key in enumerate(string_keys, start=1):
        attributes[key] = fields[index]
    # Uppercase / lowercase / titlecase mappings are hexadecimal code
    # points or empty.
    for index, key in ((12, 'upper'), (13, 'lower'), (14, 'title')):
        attributes[key] = int(fields[index], 16) if fields[index] else None
    UNICODE_ATTRIBUTES[code_point] = attributes
|||
|
|||
def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;

    Exits with an error message on malformed lines (wrong field count)
    or on a “, Last>” line whose attributes do not match the preceding
    “, First>” line.
    '''
    with open(filename, mode='r') as unicode_data_file:
        # Holds the fields of a pending “, First>” line until the
        # matching “, Last>” line is seen; empty otherwise.
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            if len(fields) != 15:
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' %{
                    'f': filename, 'l': line})
                exit(1)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                # Start of a range: remember the fields and strip the
                # name down to e.g. '<CJK Ideograph' -> 'CJK Ideograph'.
                fields_start = fields
                fields_start[1] = fields_start[1].split(',')[0][1:]
                continue
            if fields[1].endswith(', Last>'):
                # End of a range: the attributes (everything except the
                # code point) must be identical to the First line.
                fields[1] = fields[1].split(',')[0][1:]
                if fields[1:] != fields_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n' %{
                            'f': filename, 'l': line})
                    exit(1)
                # Fill every code point of the range with the shared
                # attributes.
                for code_point in range(
                        int(fields_start[0], 16),
                        int(fields[0], 16)+1):
                    fill_attribute(code_point, fields)
                fields_start = []
                continue
            # Ordinary single-code-point line.
            fill_attribute(int(fields[0], 16), fields)
            fields_start = []
|||
|
|||
def fill_east_asian_widths(filename):
    '''Stores the entire contents of the EastAsianWidths.txt file
    in the EAST_ASIAN_WIDTHS dictionary.

    Lines in EastAsianWidths.txt are either a code point range like
    this:

    9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>

    or a single code point like this:

    A015;W           # Lm         YI SYLLABLE WU

    Lines not matching either form are silently skipped.
    '''
    entry_re = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        r'\s*;\s*(?P<property>[a-zA-Z]+)')
    with open(filename, mode='r') as east_asian_widths_file:
        for line in east_asian_widths_file:
            match = entry_re.match(line)
            if not match:
                continue
            first = int(match.group('codepoint1'), 16)
            second = match.group('codepoint2')
            # A single code point is treated as a one-element range.
            last = int(second, 16) if second else first
            for code_point in range(first, last + 1):
                EAST_ASIAN_WIDTHS[code_point] = match.group('property')
|||
|
|||
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.

    Code points below U+10000 use the four-digit form “<Uxxxx>”, all
    others the eight-digit form “<Uxxxxxxxx>”.
    '''
    digits = 4 if code_point < 0x10000 else 8
    return '<U{:0{}X}>'.format(code_point, digits)
|||
|
|||
def create_charmap_dictionary(file_name):
    '''Create a dictionary for all code points found in the CHARMAP
    section of a file.

    Maps each code point to its UTF-8 encoding written as a
    “/x**…” string.  Only lines between the “CHARMAP” and
    “END CHARMAP” markers are considered; “%” comment lines are
    skipped.  Exits with an error message if no complete CHARMAP
    section is found.
    '''
    with open(file_name, mode='r') as utf8_file:
        charmap_dictionary = {}
        # Skip everything before the CHARMAP section.
        for line in utf8_file:
            if line.startswith('CHARMAP'):
                break
        for line in utf8_file:
            if line.startswith('END CHARMAP'):
                return charmap_dictionary
            if line.startswith('%'):
                continue
            # Match a single code point “<U0041> /x41 …” or a range
            # “<U3400>..<U343F> /xe3/x90/x80 …”.
            # Fixed: the original pattern used “(:?” (a capturing
            # group starting with an optional colon) instead of the
            # intended non-capturing group “(?:”, and the second code
            # point's character class had a stray dash, “[0-9-A-F]”.
            match = re.match(
                r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
                + r'(?:\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
                + r'\s+(?P<hexutf8>(/x[0-9a-f]{2}){1,4})',
                line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            codepoint2 = match.group('codepoint2')
            if not codepoint2:
                # Single code point: treat as a one-element range.
                codepoint2 = codepoint1
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                charmap_dictionary[i] = match.group('hexutf8')
        sys.stderr.write('No “CHARMAP” or no “END CHARMAP” found in %s\n'
                         %file_name)
        exit(1)
|||
|
|||
def check_charmap(original_file_name, new_file_name):
    '''Report differences in the CHARMAP section between the old and the
    new file.

    Prints the counts of removed, changed, and added code points; the
    per-character detail lines are printed only when the corresponding
    --show_* command line option was given (see the module-level ARGS).

    NOTE(review): the detail lines format the character name with
    '{:s}', which raises TypeError when a code point is absent from
    UNICODE_ATTRIBUTES (i.e. when the -u option was not given) — the
    expression then yields None.  Confirm the detail options are only
    used together with -u.
    '''
    print('************************************************************')
    print('Report on CHARMAP:')
    ocharmap = create_charmap_dictionary(original_file_name)
    ncharmap = create_charmap_dictionary(new_file_name)
    print('------------------------------------------------------------')
    # Code points present in the old file but missing from the new one.
    print('Total removed characters in newly generated CHARMAP: %d'
          %len(set(ocharmap)-set(ncharmap)))
    if ARGS.show_missing_characters:
        for key in sorted(set(ocharmap)-set(ncharmap)):
            print('removed: {:s} {:s} {:s}'.format(
                ucs_symbol(key),
                ocharmap[key],
                UNICODE_ATTRIBUTES[key]['name'] \
                if key in UNICODE_ATTRIBUTES else None))
    print('------------------------------------------------------------')
    # Code points present in both files but whose UTF-8 encoding
    # changed.
    changed_charmap = {}
    for key in set(ocharmap).intersection(set(ncharmap)):
        if ocharmap[key] != ncharmap[key]:
            changed_charmap[key] = (ocharmap[key], ncharmap[key])
    print('Total changed characters in newly generated CHARMAP: %d'
          %len(changed_charmap))
    if ARGS.show_changed_characters:
        for key in sorted(changed_charmap):
            print('changed: {:s} {:s}->{:s} {:s}'.format(
                ucs_symbol(key),
                changed_charmap[key][0],
                changed_charmap[key][1],
                UNICODE_ATTRIBUTES[key]['name'] \
                if key in UNICODE_ATTRIBUTES else None))
    print('------------------------------------------------------------')
    # Code points only present in the new file.
    print('Total added characters in newly generated CHARMAP: %d'
          %len(set(ncharmap)-set(ocharmap)))
    if ARGS.show_added_characters:
        for key in sorted(set(ncharmap)-set(ocharmap)):
            print('added: {:s} {:s} {:s}'.format(
                ucs_symbol(key),
                ncharmap[key],
                UNICODE_ATTRIBUTES[key]['name'] \
                if key in UNICODE_ATTRIBUTES else None))
|||
|
|||
def create_width_dictionary(file_name):
    '''Create a dictionary for all code points found in the WIDTH
    section of a file.

    Maps each code point to its width (0 or 2; characters not listed
    in the WIDTH section default to width 1).  Only lines between the
    “WIDTH” and “END WIDTH” markers are considered.  Exits with an
    error message if no complete WIDTH section is found.
    '''
    with open(file_name, mode='r') as utf8_file:
        width_dictionary = {}
        # Skip everything before the WIDTH section.
        for line in utf8_file:
            if line.startswith('WIDTH'):
                break
        for line in utf8_file:
            if line.startswith('END WIDTH'):
                return width_dictionary
            # Match a single entry “<U00AD> 0” or a range written with
            # three dots, “<U1100>...<U115F> 2”.
            # Fixed: “(?:” instead of the typo “(:?” and removal of a
            # stray dash in the second code point's character class
            # (“[0-9-A-F]”).
            match = re.match(
                r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
                + r'(?:\.\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
                + r'\s+(?P<width>[02])',
                line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            codepoint2 = match.group('codepoint2')
            if not codepoint2:
                codepoint2 = codepoint1
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                width_dictionary[i] = int(match.group('width'))
        # Fixed: the original formatted the message with the undefined
        # name “file” (a NameError in Python 3) and then fell through
        # returning None; use file_name and exit like
        # create_charmap_dictionary() does.
        sys.stderr.write('No “WIDTH” or no “END WIDTH” found in %s\n'
                         %file_name)
        exit(1)
|||
|
|||
def check_width(original_file_name, new_file_name):
    '''Report differences in the WIDTH section between the old and the new
    file.

    Prints the counts of removed, changed, and added width entries;
    per-character details appear only when the corresponding --show_*
    option was given.  Characters absent from a WIDTH section have
    width 1 by default, so “removed” means the width reverted to 1 and
    “added” means it was 1 before.

    NOTE(review): the detail lines format eaw/category/bidi/name with
    '{:s}'-style specifiers, which raise TypeError when the code point
    is missing from EAST_ASIAN_WIDTHS or UNICODE_ATTRIBUTES (the
    conditional expressions then yield None) — confirm the detail
    options are only used together with -u and -e.
    '''
    print('************************************************************')
    print('Report on WIDTH:')
    owidth = create_width_dictionary(original_file_name)
    nwidth = create_width_dictionary(new_file_name)
    print('------------------------------------------------------------')
    # Entries present in the old WIDTH section but not the new one.
    print('Total removed characters in newly generated WIDTH: %d'
          %len(set(owidth)-set(nwidth)))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these have width 1 now.)')
    if ARGS.show_missing_characters:
        for key in sorted(set(owidth)-set(nwidth)):
            print('removed: {:s} '.format(ucs_symbol(key))
                  + '{:d} : '.format(owidth[key])
                  + 'eaw={:s} '.format(
                      EAST_ASIAN_WIDTHS[key]
                      if key in EAST_ASIAN_WIDTHS else None)
                  + 'category={:2s} '.format(
                      UNICODE_ATTRIBUTES[key]['category']
                      if key in UNICODE_ATTRIBUTES else None)
                  + 'bidi={:3s} '.format(
                      UNICODE_ATTRIBUTES[key]['bidi']
                      if key in UNICODE_ATTRIBUTES else None)
                  + 'name={:s}'.format(
                      UNICODE_ATTRIBUTES[key]['name']
                      if key in UNICODE_ATTRIBUTES else None))
    print('------------------------------------------------------------')
    # Entries present in both files but with different widths.
    changed_width = {}
    for key in set(owidth).intersection(set(nwidth)):
        if owidth[key] != nwidth[key]:
            changed_width[key] = (owidth[key], nwidth[key])
    print('Total changed characters in newly generated WIDTH: %d'
          %len(changed_width))
    if ARGS.show_changed_characters:
        for key in sorted(changed_width):
            print('changed width: {:s} '.format(ucs_symbol(key))
                  + '{:d}->{:d} : '.format(changed_width[key][0],
                                           changed_width[key][1])
                  + 'eaw={:s} '.format(
                      EAST_ASIAN_WIDTHS[key]
                      if key in EAST_ASIAN_WIDTHS else None)
                  + 'category={:2s} '.format(
                      UNICODE_ATTRIBUTES[key]['category']
                      if key in UNICODE_ATTRIBUTES else None)
                  + 'bidi={:3s} '.format(
                      UNICODE_ATTRIBUTES[key]['bidi']
                      if key in UNICODE_ATTRIBUTES else None)
                  + 'name={:s}'.format(
                      UNICODE_ATTRIBUTES[key]['name']
                      if key in UNICODE_ATTRIBUTES else None))
    print('------------------------------------------------------------')
    # Entries only present in the new WIDTH section.
    print('Total added characters in newly generated WIDTH: %d'
          %len(set(nwidth)-set(owidth)))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these had width 1 before.)')
    if ARGS.show_added_characters:
        for key in sorted(set(nwidth)-set(owidth)):
            print('added: {:s} '.format(ucs_symbol(key))
                  + '{:d} : '.format(nwidth[key])
                  + 'eaw={:s} '.format(
                      EAST_ASIAN_WIDTHS[key]
                      if key in EAST_ASIAN_WIDTHS else None)
                  + 'category={:2s} '.format(
                      UNICODE_ATTRIBUTES[key]['category']
                      if key in UNICODE_ATTRIBUTES else None)
                  + 'bidi={:3s} '.format(
                      UNICODE_ATTRIBUTES[key]['bidi']
                      if key in UNICODE_ATTRIBUTES else None)
                  + 'name={:s}'.format(
                      UNICODE_ATTRIBUTES[key]['name']
                      if key in UNICODE_ATTRIBUTES else None))
|||
|
|||
if __name__ == "__main__":
    # Command line interface: the old and new UTF-8 charmap files are
    # required; the Unicode data files and the --show_* detail flags
    # are optional.
    PARSER = argparse.ArgumentParser(
        description='''
    Compare the contents of LC_CTYPE in two files and check for errors.
    ''')
    PARSER.add_argument(
        '-o', '--old_utf8_file',
        nargs='?',
        required=True,
        type=str,
        help='The old UTF-8 file.')
    PARSER.add_argument(
        '-n', '--new_utf8_file',
        nargs='?',
        required=True,
        type=str,
        help='The new UTF-8 file.')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        help='The UnicodeData.txt file to read.')
    PARSER.add_argument(
        '-e', '--east_asian_width_file',
        nargs='?',
        type=str,
        help='The EastAsianWidth.txt file to read.')
    PARSER.add_argument(
        '-a', '--show_added_characters',
        action='store_true',
        help='Show characters which were added in detail.')
    PARSER.add_argument(
        '-m', '--show_missing_characters',
        action='store_true',
        help='Show characters which were removed in detail.')
    PARSER.add_argument(
        '-c', '--show_changed_characters',
        action='store_true',
        help='Show characters whose width was changed in detail.')
    ARGS = PARSER.parse_args()

    # The attribute dictionaries are only filled when the data files
    # were given; the detail printing in check_charmap()/check_width()
    # relies on them.
    if ARGS.unicode_data_file:
        fill_attributes(ARGS.unicode_data_file)
    if ARGS.east_asian_width_file:
        fill_east_asian_widths(ARGS.east_asian_width_file)
    check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)
    check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)
|||
@ -0,0 +1,286 @@ |
|||
#!/usr/bin/python3 |
|||
# -*- coding: utf-8 -*- |
|||
# Copyright (C) 2014, 2015 Free Software Foundation, Inc. |
|||
# This file is part of the GNU C Library. |
|||
# |
|||
# The GNU C Library is free software; you can redistribute it and/or |
|||
# modify it under the terms of the GNU Lesser General Public |
|||
# License as published by the Free Software Foundation; either |
|||
# version 2.1 of the License, or (at your option) any later version. |
|||
# |
|||
# The GNU C Library is distributed in the hope that it will be useful, |
|||
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|||
# Lesser General Public License for more details. |
|||
# |
|||
# You should have received a copy of the GNU Lesser General Public |
|||
# License along with the GNU C Library; if not, see |
|||
# <http://www.gnu.org/licenses/>. |
|||
|
|||
'''glibc/localedata/charmaps/UTF-8 file generator script |
|||
|
|||
This script generates a glibc/localedata/charmaps/UTF-8 file |
|||
from Unicode data. |
|||
|
|||
Usage: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt |
|||
|
|||
It will output UTF-8 file |
|||
''' |
|||
|
|||
import sys |
|||
import re |
|||
|
|||
# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book, |
|||
# sections 3.11 and 4.4. |
|||
|
|||
# Short names of the 19 initial consonants (choseong), indexed in
# Unicode order; used to build “HANGUL SYLLABLE …” character names.
jamo_initial_short_name = [
    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
    'C', 'K', 'T', 'P', 'H'
]

# Short names of the 21 medial vowels (jungseong), indexed in Unicode
# order.
jamo_medial_short_name = [
    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
]

# Short names of the 27 final consonants (jongseong) plus the empty
# “no final” entry at index 0, indexed in Unicode order.
# Fixed: index 5 was 'NI', but the Unicode Jamo short name for
# U+11AC HANGUL JONGSEONG NIEUN-CIEUC is 'NJ' (see Jamo.txt and the
# Hangul syllable naming algorithm in the Unicode Standard).
jamo_final_short_name = [
    '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
    'P', 'H'
]
|||
|
|||
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.

    BMP code points get the short “<Uxxxx>” form, everything above
    U+FFFF the long “<Uxxxxxxxx>” form.
    '''
    if code_point < 0x10000:
        return '<U%04X>' % code_point
    return '<U%08X>' % code_point
|||
|
|||
def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file.

    start, end -- hexadecimal code point strings taken from the
                  “, First>”/“, Last>” pair in UnicodeData.txt.
    outfile    -- the open output file object.
    name       -- the range name, e.g. '<CJK Ideograph Extension A>'.

    Hangul Syllable ranges are expanded to one line per code point
    with a generated name; all other ranges are split into sub-ranges
    of at most 64 code points.
    '''
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        #  2000-09-24  Bruno Haible  <haible@clisp.cons.org>
        #  * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use> ranges,
        #  so they become printable and carry a width. Comment out surrogate
        #  ranges. Add a WIDTH table
        #
        # So we expand the Hangul Syllables here:
        for i in range(int(start, 16), int(end, 16)+1 ):
            # Decompose the syllable index into initial/medial/final
            # jamo indices (21 medials, 28 finals per the Unicode
            # Hangul syllable composition, base U+AC00).
            index2, index3 = divmod(i - 0xaC00, 28)
            index1, index2 = divmod(index2, 21)
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                                   + jamo_initial_short_name[index1] \
                                   + jamo_medial_short_name[index2] \
                                   + jamo_final_short_name[index3]
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                ucs_symbol(i), convert_to_hex(i),
                hangul_syllable_name))
        return
    # The UnicodeData.txt file contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F>     /xe3/x90/x80 <CJK Ideograph Extension A>
    # …
    # <U4D80>..<U4DB5>     /xe4/xb6/x80 <CJK Ideograph Extension A>
    for i in range(int(start, 16), int(end, 16), 64 ):
        if i > (int(end, 16)-64):
            # Last, possibly shorter, sub-range ending exactly at
            # “end”.
            outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
                ucs_symbol(i),
                ucs_symbol(int(end,16)),
                convert_to_hex(i),
                name))
            break
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
            ucs_symbol(i),
            ucs_symbol(i+63),
            convert_to_hex(i),
            name))
|||
|
|||
def process_charmap(flines, outfile):
    '''This function takes an array which contains *all* lines of
    of UnicodeData.txt and write lines to outfile as used in the

    CHARMAP
    …
    END CHARMAP

    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010>     /x10 DATA LINK ESCAPE
    <U3400>..<U343F>     /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800>     /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F>     /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD>     /xf4/x8f/xbf/x80 <Plane 16 Private Use>

    '''
    # Holds the fields of a pending “, First>” line until the matching
    # “, Last>” line arrives.
    fields_start = []
    for line in flines:
        fields = line.split(";")
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (10th field in
        # UnicodeData.txt) for them.
        #
        # The Characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # ”Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        #
        # Surrogate ranges are excluded here and handled as single,
        # commented-out lines below.
        if fields[1].endswith(', First>') and not 'Surrogate,' in fields[1]:
            fields_start = fields
            continue
        if fields[1].endswith(', Last>') and not 'Surrogate,' in fields[1]:
            # Strip “, Last” from the name, e.g.
            # '<CJK Ideograph Extension A, Last>' ->
            # '<CJK Ideograph Extension A>'.
            process_range(fields_start[0], fields[0],
                          outfile, fields[1][:-7]+'>')
            fields_start = []
            continue
        fields_start = []
        if 'Surrogate,' in fields[1]:
            # Comment out the surrogates in the UTF-8 file.
            # One could of course skip them completely but
            # the original UTF-8 file in glibc had them as
            # comments, so we keep these comment lines.
            outfile.write('%')
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
            ucs_symbol(int(fields[0], 16)),
            convert_to_hex(int(fields[0], 16)),
            fields[1]))
|||
|
|||
def convert_to_hex(code_point):
    '''Converts a code point to a hexadecimal UTF-8 representation
    like /x**/x**/x**.'''
    # In Python 3, chr(x).encode('UTF-8') raises for surrogate code
    # points, so the six surrogate boundary code points that appear in
    # the charmap are taken from a fixed lookup table instead.
    surrogates = {
        0xD800: '/xed/xa0/x80',
        0xDB7F: '/xed/xad/xbf',
        0xDB80: '/xed/xae/x80',
        0xDBFF: '/xed/xaf/xbf',
        0xDC00: '/xed/xb0/x80',
        0xDFFF: '/xed/xbf/xbf',
    }
    try:
        return surrogates[code_point]
    except KeyError:
        encoded = chr(code_point).encode('UTF-8')
        return ''.join('/x{:02x}'.format(byte) for byte in encoded)
|||
|
|||
def write_header_charmap(outfile):
    '''Write the header on top of the CHARMAP section to the output file'''
    for header_line in (
            "<code_set_name> UTF-8\n",
            "<comment_char> %\n",
            "<escape_char> /\n",
            "<mb_cur_min> 1\n",
            "<mb_cur_max> 6\n\n",
            "% CHARMAP generated using utf8_gen.py\n",
            "% alias ISO-10646/UTF-8\n",
            "CHARMAP\n"):
        outfile.write(header_line)
|||
|
|||
def write_header_width(outfile):
    '''Writes the header on top of the WIDTH section to the output file'''
    # Zero-width characters need no extra rule of their own: they are
    # already covered by the Cf (format control) rule below.
    for header_line in (
            '% Character width according to Unicode 7.0.0.\n',
            '% - Default width is 1.\n',
            '% - Double-width characters have width 2; generated from\n',
            '%   "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n',
            '% - Non-spacing characters have width 0; '
            + 'generated from PropList.txt or\n',
            '%   "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
            + 'UnicodeData.txt"\n',
            '% - Format control characters have width 0; '
            + 'generated from\n',
            "% \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n",
            "WIDTH\n"):
        outfile.write(header_line)
|||
|
|||
def process_width(outfile, ulines, elines):
    '''Write the WIDTH entries to the output file.

    ulines are lines from UnicodeData.txt, elines are lines from
    EastAsianWidth.txt (pre-filtered to the wide/fullwidth entries).
    '''
    entries = {}

    # From UnicodeData.txt: non-spacing marks (general category NSM in
    # field 4) and format control characters (category Cf in field 2)
    # get width 0.
    for line in ulines:
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] == "Cf":
            code_point = int(fields[0], 16)
            entries[code_point] = ucs_symbol(code_point) + '\t0'

    # From EastAsianWidth.txt: wide/fullwidth characters get width 2.
    # If an entry in EastAsianWidth.txt is found, it overrides entries
    # in UnicodeData.txt.
    for line in elines:
        fields = line.split(";")
        if '..' not in fields[0]:
            # Single code point, e.g. "3000;F".
            code_point = int(fields[0], 16)
            entries[code_point] = ucs_symbol(code_point) + '\t2'
        else:
            # Code point range, e.g. "1100..115F;W".  Drop any
            # UnicodeData-derived entries inside the range, then emit a
            # single range entry keyed by the range start.
            start, end = (int(cp, 16) for cp in fields[0].split(".."))
            for code_point in range(start, end + 1):
                entries.pop(code_point, None)
            entries[start] = '{:s}...{:s}\t2'.format(
                ucs_symbol(start), ucs_symbol(end))

    # Emit the collected entries in code point order.
    for code_point in sorted(entries):
        outfile.write(entries[code_point] + '\n')
|||
|
|||
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt")
    else:
        # Read all of UnicodeData.txt into memory; it is needed both for
        # the CHARMAP section and for the width-0 entries of the WIDTH
        # section.
        with open(sys.argv[1], mode='r') as UNIDATA_FILE:
            UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
        with open(sys.argv[2], mode='r') as EAST_ASIAN_WIDTH_FILE:
            EAST_ASIAN_WIDTH_LINES = []
            for LINE in EAST_ASIAN_WIDTH_FILE:
                # If characters from EastAsianWidth.txt which are from
                # reserved ranges (i.e. not yet assigned code points)
                # are added to the WIDTH section of the UTF-8 file, then
                # “make check” produces “Unknown Character” errors for
                # these code points because such unassigned code points
                # are not in the CHARMAP section of the UTF-8 file.
                #
                # Therefore, we skip all reserved code points when
                # reading the EastAsianWidth.txt file.
                if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
                    continue
                # Keep only the wide (W) and fullwidth (F) entries; all
                # other East Asian width classes keep the default
                # width 1.
                if re.match(r'^[^;]*;[WF]', LINE):
                    EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
        with open('UTF-8', mode='w') as OUTFILE:
            # Process UnicodeData.txt and write the CHARMAP section to
            # the UTF-8 file.
            write_header_charmap(OUTFILE)
            process_charmap(UNICODE_DATA_LINES, OUTFILE)
            OUTFILE.write("END CHARMAP\n\n")
            # Process EastAsianWidth.txt and write the WIDTH section to
            # the UTF-8 file.
            write_header_width(OUTFILE)
            process_width(OUTFILE, UNICODE_DATA_LINES, EAST_ASIAN_WIDTH_LINES)
            OUTFILE.write("END WIDTH\n")
|||
Loading…
Reference in new issue