Python源码示例:unicodedata.lookup()
示例1
def replace_unicode(self, m):
    """Replace escapes.

    Translate the escape sequence matched by ``m`` into the text it stands
    for.  ``m`` is a regex match object; the first of the named groups
    below that matched (checked in this order) selects the translation:

    * ``fesc``    -- the matched text is kept unchanged.
    * ``format``  -- replaced by a single space.
    * ``special`` -- looked up in ``BACK_SLASH_TRANSLATION``.
    * ``char``    -- everything after the first two characters is parsed
      as a hexadecimal code point.
    * ``oct``     -- everything after the first character is parsed as an
      octal code point.
    * ``name``    -- the text between the first three characters and the
      last one is resolved with ``unicodedata.lookup``.

    A ``char`` or ``name`` escape that cannot be resolved is returned as
    it was matched, and so is a match in which none of the groups took
    part.  NUL characters in the result are replaced by newlines.
    """
    groups = m.groupdict()
    esc = m.group(0)
    if groups.get('fesc'):
        value = esc
    elif groups.get('format'):
        value = ' '
    elif groups.get('special'):
        value = BACK_SLASH_TRANSLATION[esc]
    elif groups.get('char'):
        try:
            value = chr(int(esc[2:], 16))
        except (ValueError, OverflowError):
            # Not valid hex, or outside the Unicode range: keep the text.
            value = esc
    elif groups.get('oct'):
        value = chr(int(esc[1:], 8))
    elif groups.get('name'):
        try:
            value = unicodedata.lookup(esc[3:-1])
        except (KeyError, ValueError):
            # Unknown character name (or one that cannot be encoded for
            # the lookup): keep the text.
            value = esc
    else:
        # No known group matched; leave the text alone instead of failing
        # on an unbound local.
        value = esc
    return value.replace('\x00', '\n')
示例2
def start_unichar(self, attr):
if 'name' in attr:
if 'code' in attr:
self._syntax_error('<unichar/> invalid with both name and code attributes')
try:
v = unicodedata.lookup(attr['name'])
except KeyError:
self._syntax_error('<unichar/> invalid name attribute\n"%s"' % ascii(attr['name']))
v = '\0'
elif 'code' in attr:
try:
v = int(eval(attr['code']))
v = chr(v) if isPy3 else unichr(v)
except:
self._syntax_error('<unichar/> invalid code attribute %s' % ascii(attr['code']))
v = '\0'
else:
v = None
if attr:
self._syntax_error('<unichar/> invalid attribute %s' % list(attr.keys())[0])
if v is not None:
self.handle_data(v)
self._push('unichar',_selfClosingTag='unichar')
示例3
def test_aliases(self):
    # Verify that the name aliases from NameAliases.txt are usable.
    # The table below has to be extended by hand when new aliases appear,
    # unless the file is downloaded and parsed instead. See #12753.
    alias_table = (
        ('LATIN CAPITAL LETTER GHA', 0x01A2),
        ('LATIN SMALL LETTER GHA', 0x01A3),
        ('KANNADA LETTER LLLA', 0x0CDE),
        ('LAO LETTER FO FON', 0x0E9D),
        ('LAO LETTER FO FAY', 0x0E9F),
        ('LAO LETTER RO', 0x0EA3),
        ('LAO LETTER LO', 0x0EA5),
        ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
        ('YI SYLLABLE ITERATION MARK', 0xA015),
        ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
        ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5),
    )
    for alias_name, code in alias_table:
        self.checkletter(alias_name, chr(code))
        # The alias must differ from the primary name ...
        primary_name = unicodedata.name(chr(code))
        self.assertNotEqual(primary_name, alias_name)
        # ... while both resolve to the same character.
        via_alias = unicodedata.lookup(alias_name)
        via_primary = unicodedata.lookup(primary_name)
        self.assertEqual(via_alias, via_primary)
        # The 3.2.0 database is expected to reject the alias.
        with self.assertRaises(KeyError):
            unicodedata.ucd_3_2_0.lookup(alias_name)
示例4
def greek_letter_name_to_unicode(letter):
    # type: (str) -> str
    """
    Return a greek letter name as a Unicode character.

    The case of the result follows the case of ``letter``: a name that is
    entirely lower case gives the small letter, anything else gives the
    capital letter.  The common spelling "lambda" is accepted in addition
    to "lamda", which is the spelling the unicodedata library uses.

    Raises ``KeyError`` (from ``unicodedata.lookup``) when the resulting
    character name does not exist.

    Examples
    --------
    Lamda -> Λ
    Lambda -> Λ
    Omega -> Ω
    omega -> ω
    """
    case = "SMALL" if letter == letter.lower() else "CAPITAL"
    name = letter.upper()
    if name == "LAMBDA":
        # Map the everyday spelling onto the one unicodedata knows.
        name = "LAMDA"
    return unicodedata.lookup(
        "GREEK {case} LETTER {name}".format(case=case, name=name)
    )
示例5
def test_aliases(self):
    # Verify that the name aliases from NameAliases.txt are usable.
    # The table below has to be extended by hand when new aliases appear,
    # unless the file is downloaded and parsed instead. See #12753.
    alias_table = (
        ('LATIN CAPITAL LETTER GHA', 0x01A2),
        ('LATIN SMALL LETTER GHA', 0x01A3),
        ('KANNADA LETTER LLLA', 0x0CDE),
        ('LAO LETTER FO FON', 0x0E9D),
        ('LAO LETTER FO FAY', 0x0E9F),
        ('LAO LETTER RO', 0x0EA3),
        ('LAO LETTER LO', 0x0EA5),
        ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
        ('YI SYLLABLE ITERATION MARK', 0xA015),
        ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
        ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5),
    )
    for alias_name, code in alias_table:
        self.checkletter(alias_name, chr(code))
        # The alias must differ from the primary name ...
        primary_name = unicodedata.name(chr(code))
        self.assertNotEqual(primary_name, alias_name)
        # ... while both resolve to the same character.
        via_alias = unicodedata.lookup(alias_name)
        via_primary = unicodedata.lookup(primary_name)
        self.assertEqual(via_alias, via_primary)
        # The 3.2.0 database is expected to reject the alias.
        with self.assertRaises(KeyError):
            unicodedata.ucd_3_2_0.lookup(alias_name)
示例6
def test_named_sequences_full(self):
    # Exercise every named sequence listed in NamedSequences.txt for the
    # Unicode version this unicodedata module was built with.
    url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" %
           unicodedata.unidata_version)
    try:
        resource = support.open_urlresource(url, encoding="utf-8",
                                            check=check_version)
    except (OSError, HTTPException):
        self.skipTest("Could not retrieve " + url)
    self.addCleanup(resource.close)
    for raw_line in resource:
        entry = raw_line.strip()
        # Blank lines and '#' comment lines carry no data.
        if entry and not entry.startswith('#'):
            seqname, hex_fields = entry.split(';')
            expected = ''.join([chr(int(field, 16))
                                for field in hex_fields.split()])
            self.assertEqual(unicodedata.lookup(seqname), expected)
            with self.assertRaises(SyntaxError):
                self.checkletter(seqname, None)
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(seqname)
示例7
def start_unichar(self, attr):
if 'name' in attr:
if 'code' in attr:
self._syntax_error('<unichar/> invalid with both name and code attributes')
try:
v = unicodedata.lookup(attr['name']).encode('utf8')
except KeyError:
self._syntax_error('<unichar/> invalid name attribute\n"%s"' % name)
v = '\0'
elif 'code' in attr:
try:
v = unichr(int(eval(attr['code']))).encode('utf8')
except:
self._syntax_error('<unichar/> invalid code attribute %s' % attr['code'])
v = '\0'
else:
v = None
if attr:
self._syntax_error('<unichar/> invalid attribute %s' % attr.keys()[0])
if v is not None:
self.handle_data(v)
self._push(_selfClosingTag='unichar')
示例8
def test_aliases(self):
    # Verify that the name aliases from NameAliases.txt are usable.
    # The table below has to be extended by hand when new aliases appear,
    # unless the file is downloaded and parsed instead. See #12753.
    alias_table = (
        ('LATIN CAPITAL LETTER GHA', 0x01A2),
        ('LATIN SMALL LETTER GHA', 0x01A3),
        ('KANNADA LETTER LLLA', 0x0CDE),
        ('LAO LETTER FO FON', 0x0E9D),
        ('LAO LETTER FO FAY', 0x0E9F),
        ('LAO LETTER RO', 0x0EA3),
        ('LAO LETTER LO', 0x0EA5),
        ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
        ('YI SYLLABLE ITERATION MARK', 0xA015),
        ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
        ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5),
    )
    for alias_name, code in alias_table:
        self.checkletter(alias_name, chr(code))
        # The alias must differ from the primary name ...
        primary_name = unicodedata.name(chr(code))
        self.assertNotEqual(primary_name, alias_name)
        # ... while both resolve to the same character.
        via_alias = unicodedata.lookup(alias_name)
        via_primary = unicodedata.lookup(primary_name)
        self.assertEqual(via_alias, via_primary)
        # The 3.2.0 database is expected to reject the alias.
        with self.assertRaises(KeyError):
            unicodedata.ucd_3_2_0.lookup(alias_name)
示例9
def test_named_sequences_full(self):
    # Exercise every named sequence listed in NamedSequences.txt for the
    # Unicode version this unicodedata module was built with.
    url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" %
           unicodedata.unidata_version)
    try:
        resource = support.open_urlresource(url, encoding="utf-8",
                                            check=check_version)
    except (OSError, HTTPException):
        self.skipTest("Could not retrieve " + url)
    self.addCleanup(resource.close)
    for raw_line in resource:
        entry = raw_line.strip()
        # Blank lines and '#' comment lines carry no data.
        if entry and not entry.startswith('#'):
            seqname, hex_fields = entry.split(';')
            expected = ''.join([chr(int(field, 16))
                                for field in hex_fields.split()])
            self.assertEqual(unicodedata.lookup(seqname), expected)
            with self.assertRaises(SyntaxError):
                self.checkletter(seqname, None)
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(seqname)
示例10
def _greekletters(letterlist):
for l in letterlist:
ucharname = l.upper()
if ucharname == 'LAMBDA':
ucharname = 'LAMDA'
smallname = "GREEK SMALL LETTER "+ucharname
if ucharname == 'EPSILON':
smallname = "GREEK LUNATE EPSILON SYMBOL"
if ucharname == 'PHI':
smallname = "GREEK PHI SYMBOL"
_latex_specs_base['macros'].append(
MacroTextSpec(l, unicodedata.lookup(smallname))
)
_latex_specs_base['macros'].append(
MacroTextSpec(l[0].upper()+l[1:], unicodedata.lookup("GREEK CAPITAL LETTER "+ucharname))
)
示例11
def test_unicode_whitespace(self):
    # Regression test for http://bugs.jython.org/issue2226
    whitespace_pattern = re.compile(r'\s', re.UNICODE)
    non_whitespace_pattern = re.compile(r'\S', re.UNICODE)
    separator_categories = {'Zl', 'Zp', 'Zs'}
    # Characters expected to match \s besides those in the separator
    # categories above.
    info_separators = set(map(chr, (28, 29, 30, 31)))
    extras = {
        unicodedata.lookup('MONGOLIAN VOWEL SEPARATOR'),
        u'\u0085',  # NEXT LINE (NEL)
    }
    expected_whitespace = set(' \t\n\r\f\v') | info_separators | extras
    # could test to sys.maxunicode, but does not appear to be necessary
    for codepoint in xrange(0xFFFF):
        if 0xD800 <= codepoint <= 0xDFFF:
            # The range D800-DFFF is left out of the check.
            continue
        ch = unichr(codepoint)
        is_whitespace = (ch in expected_whitespace
                         or category(ch) in separator_categories)
        if is_whitespace:
            self.assertRegexpMatches(ch, whitespace_pattern)
            self.assertNotRegexpMatches(ch, non_whitespace_pattern)
        else:
            self.assertNotRegexpMatches(ch, whitespace_pattern)
            self.assertRegexpMatches(ch, non_whitespace_pattern)
示例12
def _greekletters(letterlist):
for l in letterlist:
ucharname = l.upper()
if (ucharname == 'LAMBDA'):
ucharname = 'LAMDA'
smallname = "GREEK SMALL LETTER "+ucharname;
if (ucharname == 'EPSILON'):
smallname = "GREEK LUNATE EPSILON SYMBOL"
if (ucharname == 'PHI'):
smallname = "GREEK PHI SYMBOL"
_default_macro_list.append(
(l, unicodedata.lookup(smallname))
);
_default_macro_list.append(
(l[0].upper()+l[1:], unicodedata.lookup("GREEK CAPITAL LETTER "+ucharname))
);
示例13
def test_unicode_whitespace(self):
    # Regression test for http://bugs.jython.org/issue2226
    whitespace_pattern = re.compile(r'\s', re.UNICODE)
    non_whitespace_pattern = re.compile(r'\S', re.UNICODE)
    separator_categories = {'Zl', 'Zp', 'Zs'}
    # Characters expected to match \s besides those in the separator
    # categories above.
    info_separators = set(map(chr, (28, 29, 30, 31)))
    extras = {
        unicodedata.lookup('MONGOLIAN VOWEL SEPARATOR'),
        u'\u0085',  # NEXT LINE (NEL)
    }
    expected_whitespace = set(' \t\n\r\f\v') | info_separators | extras
    # could test to sys.maxunicode, but does not appear to be necessary
    for codepoint in xrange(0xFFFF):
        if 0xD800 <= codepoint <= 0xDFFF:
            # The range D800-DFFF is left out of the check.
            continue
        ch = unichr(codepoint)
        is_whitespace = (ch in expected_whitespace
                         or category(ch) in separator_categories)
        if is_whitespace:
            self.assertRegexpMatches(ch, whitespace_pattern)
            self.assertNotRegexpMatches(ch, non_whitespace_pattern)
        else:
            self.assertNotRegexpMatches(ch, whitespace_pattern)
            self.assertRegexpMatches(ch, non_whitespace_pattern)
示例14
def test_unicode(self, sparse):
    # See GH 6885 - get_dummies chokes on unicode values
    import unicodedata
    plain_e = 'e'
    e_acute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
    values = [plain_e, e_acute, e_acute]
    result = get_dummies(values, prefix='letter', sparse=sparse)
    # One indicator column per distinct value, named "<prefix>_<value>".
    expected = DataFrame(
        {
            'letter_e': [1, 0, 0],
            u('letter_%s') % e_acute: [0, 1, 1],
        },
        dtype=np.uint8,
    )
    if sparse:
        expected = expected.apply(pd.SparseArray, fill_value=0)
    assert_frame_equal(result, expected)
示例15
def test_ascii_letters(self):
    import unicodedata
    # Round-trip every letter 'a'..'z' through lookup() and name().
    # The stop value of the range is exclusive, hence the "+ 1" so that
    # 'z' itself is covered as well.
    for code_point in xrange(ord("a"), ord("z") + 1):
        char = chr(code_point)
        name = "LATIN SMALL LETTER %s" % char.upper()
        code = unicodedata.lookup(name)
        self.assertEqual(unicodedata.name(code), name)
示例16
def test_bmp_characters(self):
    import unicodedata
    # For every code point below 0x10000 that has a name, looking that
    # name up again must give back the same character.
    for code in xrange(0x10000):
        char = unichr(code)
        # The None default marks code points without a name.
        name = unicodedata.name(char, None)
        if name is not None:
            self.assertEqual(unicodedata.lookup(name), char)
示例17
def test_errors(self):
    import unicodedata
    # Each entry: expected exception, callable, positional arguments.
    # Covers a missing argument for name() and lookup(), a two-character
    # string for name(), and an unknown name for lookup().
    failing_calls = (
        (TypeError, unicodedata.name, ()),
        (TypeError, unicodedata.name, (u'xx',)),
        (TypeError, unicodedata.lookup, ()),
        (KeyError, unicodedata.lookup, (u'unknown',)),
    )
    for expected_error, func, args in failing_calls:
        self.assertRaises(expected_error, func, *args)
示例18
def test_unicode(self, sparse):
    # See GH 6885 - get_dummies chokes on unicode values
    import unicodedata
    plain_e = 'e'
    e_acute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
    values = [plain_e, e_acute, e_acute]
    result = get_dummies(values, prefix='letter', sparse=sparse)
    # One indicator column per distinct value, named "<prefix>_<value>".
    expected = DataFrame(
        {
            'letter_e': [1, 0, 0],
            u('letter_%s') % e_acute: [0, 1, 1],
        },
        dtype=np.uint8,
    )
    assert_frame_equal(result, expected)
示例19
def test_ascii_letters(self):
    import unicodedata
    # Round-trip every letter 'a'..'z' through lookup() and name().
    # The stop value of the range is exclusive, hence the "+ 1" so that
    # 'z' itself is covered as well.
    for code_point in xrange(ord("a"), ord("z") + 1):
        char = chr(code_point)
        name = "LATIN SMALL LETTER %s" % char.upper()
        code = unicodedata.lookup(name)
        self.assertEqual(unicodedata.name(code), name)
示例20
def test_bmp_characters(self):
    import unicodedata
    # For every code point below 0x10000 that has a name, looking that
    # name up again must give back the same character.
    for code in xrange(0x10000):
        char = unichr(code)
        # The None default marks code points without a name.
        name = unicodedata.name(char, None)
        if name is not None:
            self.assertEqual(unicodedata.lookup(name), char)
示例21
def test_errors(self):
    import unicodedata
    # Each entry: expected exception, callable, positional arguments.
    # Covers a missing argument for name() and lookup(), a two-character
    # string for name(), and an unknown name for lookup().
    failing_calls = (
        (TypeError, unicodedata.name, ()),
        (TypeError, unicodedata.name, (u'xx',)),
        (TypeError, unicodedata.lookup, ()),
        (KeyError, unicodedata.lookup, (u'unknown',)),
    )
    for expected_error, func, args in failing_calls:
        self.assertRaises(expected_error, func, *args)
示例22
def U(name):
    """Look up the Unicode character called ``name``.

    Returns the character, or ``None`` when unicodedata has no character
    of that name; in the latter case a line naming the missing character
    is appended to the module-level ``unicode_warnings`` string.
    """
    global unicode_warnings
    try:
        return unicodedata.lookup(name)
    except KeyError:
        unicode_warnings += 'No \'%s\' in unicodedata\n' % name
        return None
示例23
def test_ascii_letters(self):
    import unicodedata
    # Round-trip every letter 'a'..'z' through lookup() and name().
    # The stop value of the range is exclusive, hence the "+ 1" so that
    # 'z' itself is covered as well.
    for code_point in xrange(ord("a"), ord("z") + 1):
        char = chr(code_point)
        name = "LATIN SMALL LETTER %s" % char.upper()
        code = unicodedata.lookup(name)
        self.assertEqual(unicodedata.name(code), name)
示例24
def test_bmp_characters(self):
    import unicodedata
    # For every code point below 0x10000 that has a name, looking that
    # name up again must give back the same character.
    for code in xrange(0x10000):
        char = unichr(code)
        # The None default marks code points without a name.
        name = unicodedata.name(char, None)
        if name is not None:
            self.assertEqual(unicodedata.lookup(name), char)
示例25
def test_errors(self):
    import unicodedata
    # Each entry: expected exception, callable, positional arguments.
    # Covers a missing argument for name() and lookup(), a two-character
    # string for name(), and an unknown name for lookup().
    failing_calls = (
        (TypeError, unicodedata.name, ()),
        (TypeError, unicodedata.name, (u'xx',)),
        (TypeError, unicodedata.lookup, ()),
        (KeyError, unicodedata.lookup, (u'unknown',)),
    )
    for expected_error, func, args in failing_calls:
        self.assertRaises(expected_error, func, *args)
示例26
def test_unicode(self, sparse):
    # See GH 6885 - get_dummies chokes on unicode values
    import unicodedata
    plain_e = 'e'
    e_acute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
    values = [plain_e, e_acute, e_acute]
    result = get_dummies(values, prefix='letter', sparse=sparse)
    # One indicator column per distinct value, named "<prefix>_<value>".
    expected = DataFrame(
        {
            'letter_e': [1, 0, 0],
            u('letter_%s') % e_acute: [0, 1, 1],
        },
        dtype=np.uint8,
    )
    if sparse:
        expected = expected.apply(pd.SparseArray, fill_value=0)
    assert_frame_equal(result, expected)
示例27
def test_ascii_letters(self):
    # Round-trip every letter 'a'..'z' through lookup() and name().
    # The stop value of the range is exclusive, hence the "+ 1" so that
    # 'z' itself is covered as well.
    for code_point in range(ord("a"), ord("z") + 1):
        char = chr(code_point)
        name = "LATIN SMALL LETTER %s" % char.upper()
        code = unicodedata.lookup(name)
        self.assertEqual(unicodedata.name(code), name)
示例28
def test_bmp_characters(self):
    # For every code point below 0x10000 that has a name, looking that
    # name up again must give back the same character.
    for char in map(chr, range(0x10000)):
        name = unicodedata.name(char, None)
        if name is None:
            # No name for this code point, nothing to round-trip.
            continue
        self.assertEqual(unicodedata.lookup(name), char)
示例29
def test_named_sequences_sample(self):
    # Spot-check a handful of named sequences. See #12753.
    sample = (
        ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
        ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
        ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
        ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
        ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
    )
    for sequence_name, expected in sample:
        # lookup() resolves the sequence name to its code points ...
        resolved = unicodedata.lookup(sequence_name)
        self.assertEqual(resolved, expected)
        # ... whereas checkletter() is expected to fail with SyntaxError
        # and the 3.2.0 database with KeyError.
        with self.assertRaises(SyntaxError):
            self.checkletter(sequence_name, None)
        with self.assertRaises(KeyError):
            unicodedata.ucd_3_2_0.lookup(sequence_name)
示例30
def test_errors(self):
    # Each entry: expected exception, callable, positional arguments.
    # Covers a missing argument for name() and lookup(), a two-character
    # string for name(), and an unknown name for lookup().
    failing_calls = (
        (TypeError, unicodedata.name, ()),
        (TypeError, unicodedata.name, ('xx',)),
        (TypeError, unicodedata.lookup, ()),
        (KeyError, unicodedata.lookup, ('unknown',)),
    )
    for expected_error, func, args in failing_calls:
        self.assertRaises(expected_error, func, *args)