Python源码示例:unicodedata.name()
示例1
def main(argv):
    """Print every code point covered by the given font files.

    ``argv[0]`` is skipped; every later entry must name an existing file.
    The code points of all fonts are merged, sorted and printed one per
    line as ``0xXXXX``.  When ``FLAGS.show_char`` is set the character and
    its Unicode name are appended; when ``FLAGS.show_subsets`` is set the
    subsets reported by ``fonts.SubsetsForCodepoint`` are appended.

    Exits through ``sys.exit`` with a message when no font file is given or
    when an argument is not a regular file.
    """
    font_paths = argv[1:]
    if not font_paths:
        sys.exit('Must specify one or more font files.')

    codepoints = set()
    for path in font_paths:
        if not os.path.isfile(path):
            sys.exit('%s is not a file' % path)
        codepoints |= fonts.CodepointsInFont(path)

    for cp in sorted(codepoints):
        char_part = ''
        if FLAGS.show_char:
            glyph = unichr(cp)
            # An empty name is used for characters unicodedata cannot name.
            char_part = ' %s %s' % (glyph.strip(), unicodedata.name(glyph, ''))
        subset_part = ''
        if FLAGS.show_subsets:
            subset_part = ' subset:%s' % ','.join(fonts.SubsetsForCodepoint(cp))
        print(u'0x%04X%s%s' % (cp, char_part, subset_part))
示例2
def main(global_delay, local_delay, concurrency):
    """Set up the module-level throttling state and start the web app.

    Stores both delays and a semaphore created from *concurrency* in module
    globals, builds the inverted index, registers the three GET routes and
    hands the application to ``web.run_app`` on ``PORT``.
    """
    global global_sleep, local_sleep, semaphore, index
    global_sleep, local_sleep = global_delay, local_delay
    semaphore = asyncio.Semaphore(concurrency)

    settings = (
        ('Global delay =', global_delay),
        ('Local delay =', local_delay),
        ('Max. concurrency =', concurrency),
    )
    for label, value in settings:
        print(label, value)

    print('Building inverted index...')
    index = build_index()

    app = web.Application()
    routes = (
        ('/', usage),
        ('/index/{word}', index_for),
        ('/name/{char}', char_name),
    )
    for path, handler in routes:
        app.router.add_get(path, handler)

    print('Listening on port', PORT)
    web.run_app(app, port=PORT)
示例3
def test_longstrings(self):
# test long strings to check for memory overflow problems
errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
"backslashreplace"]
# register the handlers under different names,
# to prevent the codec from recognizing the name
for err in errors:
codecs.register_error("test." + err, codecs.lookup_error(err))
l = 1000
errors += [ "test." + err for err in errors ]
for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]:
for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
"utf-8", "utf-7", "utf-16", "utf-32"):
for err in errors:
try:
uni.encode(enc, err)
except UnicodeError:
pass
示例4
def test_hangul_syllables(self):
    """Pass a sample of Hangul syllable names and characters to checkletter.

    Also asserts that U+D7A4, the code point right after the last sample,
    has no name in ``unicodedata``.
    """
    samples = (
        ("HANGUL SYLLABLE GA", u"\uac00"),
        ("HANGUL SYLLABLE GGWEOSS", u"\uafe8"),
        ("HANGUL SYLLABLE DOLS", u"\ub3d0"),
        ("HANGUL SYLLABLE RYAN", u"\ub7b8"),
        ("HANGUL SYLLABLE MWIK", u"\ubba0"),
        ("HANGUL SYLLABLE BBWAEM", u"\ubf88"),
        ("HANGUL SYLLABLE SSEOL", u"\uc370"),
        ("HANGUL SYLLABLE YI", u"\uc758"),
        ("HANGUL SYLLABLE JJYOSS", u"\ucb40"),
        ("HANGUL SYLLABLE KYEOLS", u"\ucf28"),
        ("HANGUL SYLLABLE PAN", u"\ud310"),
        ("HANGUL SYLLABLE HWEOK", u"\ud6f8"),
        ("HANGUL SYLLABLE HIH", u"\ud7a3"),
    )
    for syllable_name, syllable in samples:
        self.checkletter(syllable_name, syllable)

    import unicodedata
    self.assertRaises(ValueError, unicodedata.name, u"\ud7a4")
示例5
def test_strict_eror_handling(self):
    """Malformed ``\\N{...}`` escapes must raise UnicodeError in strict mode."""
    malformed_escapes = (
        "\\N{blah}",                  # bogus character name
        "\\N{%s}" % ("x" * 100000),   # long bogus character name
        "\\N{SPACE",                  # missing closing brace
        "\\NSPACE",                   # missing opening brace
    )
    for escaped in malformed_escapes:
        self.assertRaises(
            UnicodeError,
            unicode, escaped, 'unicode-escape', 'strict'
        )
示例6
def test_ipy2_gh357(self):
    """https://github.com/IronLanguages/ironpython2/issues/357"""
    import unicodedata

    ideograph = u'\u4e2d'

    # The expected name depends on the is_cli flag: one branch expects the
    # range placeholder, the other the per-character name.
    if is_cli:
        expected_name = '<CJK IDEOGRAPH, FIRST>..<CJK IDEOGRAPH, LAST>'
    else:
        expected_name = 'CJK UNIFIED IDEOGRAPH-4E2D'
    self.assertEqual(unicodedata.name(ideograph), expected_name)

    # Each numeric lookup raises without a default and echoes the default
    # back when one is supplied.
    for lookup in (unicodedata.decimal, unicodedata.digit, unicodedata.numeric):
        self.assertRaises(ValueError, lookup, ideograph)
        self.assertEqual(lookup(ideograph, 0), 0)

    expected_properties = (
        (unicodedata.category, 'Lo'),
        (unicodedata.bidirectional, 'L'),
        (unicodedata.combining, 0),
        (unicodedata.east_asian_width, 'W'),
        (unicodedata.mirrored, 0),
        (unicodedata.decomposition, ''),
    )
    for prop, expected in expected_properties:
        self.assertEqual(prop(ideograph), expected)
示例7
def test_longstrings(self):
# test long strings to check for memory overflow problems
errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
"backslashreplace"]
# register the handlers under different names,
# to prevent the codec from recognizing the name
for err in errors:
codecs.register_error("test." + err, codecs.lookup_error(err))
l = 1000
errors += [ "test." + err for err in errors ]
for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]:
for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
"utf-8", "utf-7", "utf-16", "utf-32"):
for err in errors:
try:
uni.encode(enc, err)
except UnicodeError:
pass
示例8
def test_hangul_syllables(self):
    """Pass a sample of Hangul syllable names and characters to checkletter.

    Also asserts that U+D7A4, the code point right after the last sample,
    has no name in ``unicodedata``.
    """
    samples = (
        ("HANGUL SYLLABLE GA", u"\uac00"),
        ("HANGUL SYLLABLE GGWEOSS", u"\uafe8"),
        ("HANGUL SYLLABLE DOLS", u"\ub3d0"),
        ("HANGUL SYLLABLE RYAN", u"\ub7b8"),
        ("HANGUL SYLLABLE MWIK", u"\ubba0"),
        ("HANGUL SYLLABLE BBWAEM", u"\ubf88"),
        ("HANGUL SYLLABLE SSEOL", u"\uc370"),
        ("HANGUL SYLLABLE YI", u"\uc758"),
        ("HANGUL SYLLABLE JJYOSS", u"\ucb40"),
        ("HANGUL SYLLABLE KYEOLS", u"\ucf28"),
        ("HANGUL SYLLABLE PAN", u"\ud310"),
        ("HANGUL SYLLABLE HWEOK", u"\ud6f8"),
        ("HANGUL SYLLABLE HIH", u"\ud7a3"),
    )
    for syllable_name, syllable in samples:
        self.checkletter(syllable_name, syllable)

    import unicodedata
    self.assertRaises(ValueError, unicodedata.name, u"\ud7a4")
示例9
def test_strict_eror_handling(self):
    """Malformed ``\\N{...}`` escapes must raise UnicodeError in strict mode."""
    malformed_escapes = (
        "\\N{blah}",                  # bogus character name
        "\\N{%s}" % ("x" * 100000),   # long bogus character name
        "\\N{SPACE",                  # missing closing brace
        "\\NSPACE",                   # missing opening brace
    )
    for escaped in malformed_escapes:
        self.assertRaises(
            UnicodeError,
            unicode, escaped, 'unicode-escape', 'strict'
        )
示例10
def get_unicode_index(symbol):
    r"""get_unicode_index(symbol) -> integer

    Return the integer index (from the Unicode table) of *symbol*.

    *symbol* can be a single unicode character, a TeX command
    (e.g. r'\pi'), or a Type1 symbol name (e.g. 'phi').

    Raises ValueError if *symbol* is neither a single character nor a key
    of the ``tex2uni`` table.
    """
    # From UTR #25: U+2212 minus sign is the preferred
    # representation of the unary and binary minus sign rather than
    # the ASCII-derived U+002D hyphen-minus, because minus sign is
    # unambiguous and because it is rendered with a more desirable
    # length, usually longer than a hyphen.
    if symbol == '-':
        return 0x2212
    try:  # This will succeed if symbol is a single unicode char
        return ord(symbol)
    except TypeError:
        pass
    try:  # Is symbol a TeX symbol (e.g. \alpha)
        return tex2uni[symbol.strip("\\")]
    except KeyError:
        # Single-line message: the previous triple-quoted literal leaked a
        # newline and the source indentation into the error text.
        raise ValueError(
            "'{0}' is not a valid Unicode character or TeX/Type1 symbol"
            .format(symbol))
示例11
def render_glyph(self, ox, oy, facename, font_class, sym, fontsize, dpi):
    """
    Draw a glyph and record that it was used.

    - *ox*, *oy*: position
    - *facename*: One of the TeX face names
    - *font_class*: forwarded unchanged to ``_get_info``
    - *sym*: TeX symbol name or single character
    - *fontsize*: fontsize in points
    - *dpi*: The dpi to draw at.
    """
    info = self._get_info(facename, font_class, sym, fontsize, dpi)
    font_path = info.font.fname
    realpath, stat_key = get_realpath_and_stat(font_path)
    # The first glyph seen for a stat key creates its (realpath, set) entry;
    # later glyphs are added to the existing set.
    entry = self.used_characters.setdefault(stat_key, (realpath, set()))
    entry[1].add(info.num)
    self.mathtext_backend.render_glyph(ox, oy, info)
示例12
def test_longstrings(self):
# test long strings to check for memory overflow problems
errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
"backslashreplace"]
# register the handlers under different names,
# to prevent the codec from recognizing the name
for err in errors:
codecs.register_error("test." + err, codecs.lookup_error(err))
l = 1000
errors += [ "test." + err for err in errors ]
for uni in [ s*l for s in (u"x", u"\u3042", u"a\xe4") ]:
for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
"utf-8", "utf-7", "utf-16", "utf-32"):
for err in errors:
try:
uni.encode(enc, err)
except UnicodeError:
pass
示例13
def test_hangul_syllables(self):
    """Pass a sample of Hangul syllable names and characters to checkletter.

    Also asserts that U+D7A4, the code point right after the last sample,
    has no name in ``unicodedata``.
    """
    samples = (
        ("HANGUL SYLLABLE GA", u"\uac00"),
        ("HANGUL SYLLABLE GGWEOSS", u"\uafe8"),
        ("HANGUL SYLLABLE DOLS", u"\ub3d0"),
        ("HANGUL SYLLABLE RYAN", u"\ub7b8"),
        ("HANGUL SYLLABLE MWIK", u"\ubba0"),
        ("HANGUL SYLLABLE BBWAEM", u"\ubf88"),
        ("HANGUL SYLLABLE SSEOL", u"\uc370"),
        ("HANGUL SYLLABLE YI", u"\uc758"),
        ("HANGUL SYLLABLE JJYOSS", u"\ucb40"),
        ("HANGUL SYLLABLE KYEOLS", u"\ucf28"),
        ("HANGUL SYLLABLE PAN", u"\ud310"),
        ("HANGUL SYLLABLE HWEOK", u"\ud6f8"),
        ("HANGUL SYLLABLE HIH", u"\ud7a3"),
    )
    for syllable_name, syllable in samples:
        self.checkletter(syllable_name, syllable)

    import unicodedata
    self.assertRaises(ValueError, unicodedata.name, u"\ud7a4")
示例14
def test_strict_eror_handling(self):
    """Malformed ``\\N{...}`` escapes must raise UnicodeError in strict mode."""
    malformed_escapes = (
        "\\N{blah}",                  # bogus character name
        "\\N{%s}" % ("x" * 100000),   # long bogus character name
        "\\N{SPACE",                  # missing closing brace
        "\\NSPACE",                   # missing opening brace
    )
    for escaped in malformed_escapes:
        self.assertRaises(
            UnicodeError,
            unicode, escaped, 'unicode-escape', 'strict'
        )
示例15
def unicode(self, irc, msg, args, query):
    """[character]
    Look up unicode character details
    """
    # Query the lookup service and reply with one summary per result.
    url = "http://unicodelookup.com/lookup?" + urlencode({"q": query, "o": 0})
    raw = web.getUrl(url)
    template = "%s (%s): %s [HTML: %s / Decimal: %s / Hex: %s]"
    try:
        payload = json.loads(raw)
        responses = []
        for result in payload["results"]:
            ucode = result[2].replace("0x", "U+")
            # NOTE(review): the name is looked up from the raw query, not
            # from this result, and unicodedata.name() raises TypeError
            # (not handled below) unless the query is exactly one
            # character -- confirm this is intended.
            name = unicodedata.name("{0}".format(query))
            fields = (ucode, name, result[4], result[3], result[1], result[2])
            responses.append(template % fields)
        irc.reply("; ".join(responses))
    except ValueError:
        # Raised by json.loads on bad JSON or by unicodedata.name for a
        # character without a name.
        irc.reply("No unicode characters matching /" + query + "/ found.")
示例16
def get_unicode_index(symbol):
    r"""get_unicode_index(symbol) -> integer

    Return the integer index (from the Unicode table) of *symbol*.

    *symbol* can be a single unicode character, a TeX command
    (e.g. r'\pi'), or a Type1 symbol name (e.g. 'phi').

    Raises ValueError if *symbol* is neither a single character nor a key
    of the ``tex2uni`` table.
    """
    # From UTR #25: U+2212 minus sign is the preferred
    # representation of the unary and binary minus sign rather than
    # the ASCII-derived U+002D hyphen-minus, because minus sign is
    # unambiguous and because it is rendered with a more desirable
    # length, usually longer than a hyphen.
    if symbol == '-':
        return 0x2212
    try:  # This will succeed if symbol is a single unicode char
        return ord(symbol)
    except TypeError:
        pass
    try:  # Is symbol a TeX symbol (e.g. \alpha)
        return tex2uni[symbol.strip("\\")]
    except KeyError:
        # Single-line message: the previous triple-quoted literal leaked a
        # newline and the source indentation into the error text.
        raise ValueError(
            "'{0}' is not a valid Unicode character or TeX/Type1 symbol"
            .format(symbol))
示例17
def render_glyph(self, ox, oy, facename, font_class, sym, fontsize, dpi):
    """
    Draw a glyph and record that it was used.

    - *ox*, *oy*: position
    - *facename*: One of the TeX face names
    - *font_class*: forwarded unchanged to ``_get_info``
    - *sym*: TeX symbol name or single character
    - *fontsize*: fontsize in points
    - *dpi*: The dpi to draw at.
    """
    info = self._get_info(facename, font_class, sym, fontsize, dpi)
    font_path = info.font.fname
    realpath, stat_key = get_realpath_and_stat(font_path)
    # The first glyph seen for a stat key creates its (realpath, set) entry;
    # later glyphs are added to the existing set.
    entry = self.used_characters.setdefault(stat_key, (realpath, set()))
    entry[1].add(info.num)
    self.mathtext_backend.render_glyph(ox, oy, info)
示例18
def remove_diacritics(self):
    """
    Return ``self.word`` with its diacritics removed.

    The word is decomposed with NFKD and only the characters whose Unicode
    name starts with ``LATIN`` are kept.  Combining marks are dropped this
    way, and so is every other character that is not a Latin letter.

    :return: str: the input string stripped of its diacritics

    Examples:

    >>> Word('ġelǣd').remove_diacritics()
    'gelæd'
    """
    decomposed = unicodedata.normalize('NFKD', self.word)
    # The '' default keeps characters without a Unicode name (e.g. tab or
    # newline) from raising ValueError; they are dropped like any other
    # non-Latin character.
    return ''.join(
        char for char in decomposed
        if unicodedata.name(char, '').startswith('LATIN')
    )
示例19
def __init__(self, unicodeHexValue, block):
    """ Set up a unicode character.

    Arguments:
    unicodeHexValue -- an integer that should correspond to a
                       Unicode code point.
    block -- the CharacterBlock this character belongs to.

    Raises:
    ValueError -- if unicodeHexValue is not a valid code point.
    """
    if not 0 <= unicodeHexValue <= 0x10FFFF:
        raise ValueError("numeric value outside Unicode range")
    self.unicodeHexValue = unicodeHexValue
    char = py23char(unicodeHexValue)
    self.unichr = char
    # Use name check to filter out unused characters:
    # unicodedata.name() raises ValueError for these.
    self.name = unicodedata.name(char)
    self.equivalents = {}
    self._block = block
示例20
def _equivalent(self, char, prev, next, implicitA):
    """ Transliterate a Devanagari character to Latin.

    Add implicit As unless overridden by VIRAMA.

    Returns the list of equivalents collected for *char*; *prev* is
    accepted but not used here.
    """
    virama = DevanagariCharacter._VIRAMA
    result = []
    if char.unichr != virama:
        result.append(char.equivalents[self.name])

    # Append implicit A to consonants if the next character isn't a vowel
    # (or a VIRAMA), or if there is no next character at all.
    wants_implicit_a = implicitA and char.isConsonant and (
        next is None
        or (next.unichr != virama and not next.isVowel)
    )
    if wants_implicit_a:
        letter_a = characterBlocks['DEVANAGARI'][DevanagariCharacter._LETTER_A]
        result.append(letter_a.equivalents[self.name])
    return result
示例21
def __init__(self, unicodeHexValue, block):
    """ Set up a unicode character.

    Arguments:
    unicodeHexValue -- an integer that should correspond to a
                       Unicode code point.
    block -- the CharacterBlock this character belongs to.

    Raises:
    ValueError -- if unicodeHexValue is outside the Unicode range, or if
                  the code point has no name in unicodedata.
    """
    if unicodeHexValue < 0 or unicodeHexValue > 0x10FFFF:
        # Raise an exception instance: raising a (class, message) tuple is
        # a TypeError on Python 3 and drops the message on Python 2.
        raise ValueError("numeric value outside Unicode range")
    self.unicodeHexValue = unicodeHexValue
    # Use name check to filter out unused characters:
    # unicodedata.name() raises ValueError for these.
    self.chr = chr(self.unicodeHexValue)
    self.name = unicodedata.name(self.chr)
    self.equivalents = {}
    self._block = block
示例22
def _equivalent(self, char, prev, next, implicitA):
    """ Transliterate a Devanagari character to Latin.

    Add implicit As unless overridden by VIRAMA.

    The *implicitA* argument is overridden with False on entry, so the
    implicit-A branch below is currently never taken.  *prev* is accepted
    but not used here.
    """
    implicitA = False  # Force it!
    virama = DevanagariCharacter._VIRAMA
    result = []
    if char.chr != virama:
        result.append(char.equivalents[self.name])

    # Append implicit A to consonants if the next character isn't a vowel
    # (or a VIRAMA), or if there is no next character at all.
    wants_implicit_a = implicitA and char.isConsonant and (
        next is None
        or (next.chr != virama and not next.isVowel)
    )
    if wants_implicit_a:
        letter_a = characterBlocks['DEVANAGARI'][DevanagariCharacter._LETTER_A]
        result.append(letter_a.equivalents[self.name])
    return result
示例23
def _combining_class(cp):
    """Return the canonical combining class of the code point *cp*.

    ``unicodedata.combining`` reports 0 when no combining class is defined,
    so a result of 0 is only accepted when the character also has a name.

    Raises ValueError ("Unknown character in unicodedata") when the class
    is 0 and the character has no name.
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3
    char = to_char(cp)
    v = unicodedata.combining(char)
    if v == 0:
        # Without a default, unicodedata.name() raises its own ValueError
        # for unnamed characters, which made this check unreachable; the
        # '' default lets the intended error be raised instead.
        if not unicodedata.name(char, ''):
            raise ValueError("Unknown character in unicodedata")
    return v
示例24
def _combining_class(cp):
    """Return the canonical combining class of the code point *cp*.

    ``unicodedata.combining`` reports 0 when no combining class is defined,
    so a result of 0 is only accepted when the character also has a name.

    Raises ValueError ("Unknown character in unicodedata") when the class
    is 0 and the character has no name.
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3
    char = to_char(cp)
    v = unicodedata.combining(char)
    if v == 0:
        # Without a default, unicodedata.name() raises its own ValueError
        # for unnamed characters, which made this check unreachable; the
        # '' default lets the intended error be raised instead.
        if not unicodedata.name(char, ''):
            raise ValueError("Unknown character in unicodedata")
    return v
示例25
def _combining_class(cp):
    """Return the canonical combining class of the code point *cp*.

    ``unicodedata.combining`` reports 0 when no combining class is defined,
    so a result of 0 is only accepted when the character also has a name.

    Raises ValueError ("Unknown character in unicodedata") when the class
    is 0 and the character has no name.
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3
    char = to_char(cp)
    v = unicodedata.combining(char)
    if v == 0:
        # Without a default, unicodedata.name() raises its own ValueError
        # for unnamed characters, which made this check unreachable; the
        # '' default lets the intended error be raised instead.
        if not unicodedata.name(char, ''):
            raise ValueError("Unknown character in unicodedata")
    return v
示例26
def find_unicodedata_name(data: str) -> list:
    """Look up the Unicode name of every character in a string.

    ♠ == BLACK SPADE SUIT
    \\N{BLACK SPADE SUIT} == ♠

    :param data: the string to inspect
    :return: list with the Unicode name of each character, in order
    """
    return [unicodedata.name(char) for char in data]
示例27
async def charinfo(self, ctx: Context, *, characters: str) -> None:
    """Shows you information on up to 25 unicode characters."""
    # Declared ``async`` because the body awaits ``ctx.send``; ``await`` is a
    # syntax error inside a plain ``def``.

    # NOTE: re.match anchors at the start of the input, so this guard only
    # fires when the text begins with a custom emoji tag.
    match = re.match(r"<(a?):(\w+):(\d+)>", characters)
    if match:
        embed = Embed(
            title="Non-Character Detected",
            description=(
                "Only unicode characters can be processed, but a custom Discord emoji "
                "was found. Please remove it and try again."
            )
        )
        embed.colour = Colour.red()
        await ctx.send(embed=embed)
        return

    if len(characters) > 25:
        embed = Embed(title=f"Too many characters ({len(characters)}/25)")
        embed.colour = Colour.red()
        await ctx.send(embed=embed)
        return

    def get_info(char: str) -> Tuple[str, str]:
        # Returns (markdown line for the embed, escape sequence for the
        # character).
        digit = f"{ord(char):x}"
        # Up to four hex digits fit a \uXXXX escape; longer code points
        # are written as \UXXXXXXXX.
        if len(digit) <= 4:
            u_code = f"\\u{digit:>04}"
        else:
            u_code = f"\\U{digit:>08}"
        url = f"https://www.compart.com/en/unicode/U+{digit:>04}"
        # Characters without a Unicode name get an empty link text.
        name = f"[{unicodedata.name(char, '')}]({url})"
        info = f"`{u_code.ljust(10)}`: {name} - {utils.escape_markdown(char)}"
        return info, u_code

    charlist, rawlist = zip(*(get_info(c) for c in characters))

    embed = Embed(description="\n".join(charlist))
    embed.set_author(name="Character Info")

    # The combined raw escapes are only shown for more than one character.
    if len(characters) > 1:
        embed.add_field(name='Raw', value=f"`{''.join(rawlist)}`", inline=False)

    await ctx.send(embed=embed)
示例28
async def send_pep_zero(self, ctx: Context) -> None:
    """Send information about PEP 0."""
    # Declared ``async`` because the body awaits ``ctx.send``; ``await`` is a
    # syntax error inside a plain ``def``.
    pep_embed = Embed(
        title="**PEP 0 - Index of Python Enhancement Proposals (PEPs)**",
        description="[Link](https://www.python.org/dev/peps/)"
    )
    pep_embed.set_thumbnail(url=ICON_URL)
    # The field values are fixed text written into the embed.
    pep_embed.add_field(name="Status", value="Active")
    pep_embed.add_field(name="Created", value="13-Jul-2000")
    pep_embed.add_field(name="Type", value="Informational")

    await ctx.send(embed=pep_embed)
示例29
def _ReformatLine(line):
    """Annotate a ``0x``-prefixed line with its character and Unicode name.

    Lines that do not start with ``0x`` are returned unchanged.  Otherwise
    the four characters after the prefix are parsed as a hexadecimal code
    point and the result is ``0xXXXX <char> <name>``; the name is empty for
    characters unicodedata cannot name.
    """
    if not line.startswith('0x'):
        return line
    codepoint = int(line[2:6], 16)
    char = unichr(codepoint)
    described = char + ' ' + unicodedata.name(char, '')
    return '0x%04X %s' % (codepoint, described)
示例30
def validate_unicode_normalization(text):
    """
    Tests that letters composed of multiple Unicode characters (such as a base
    letter plus combining diacritics) conform to NFC normalization (canonical
    decomposition followed by canonical composition).

    When *text* differs from its NFC form, a warning is issued through
    ``warn`` naming the first differing character, located by tab-separated
    column and by position inside that column.
    """
    normalized_text = unicodedata.normalize('NFC', text)
    if text == normalized_text:
        return

    # Find the first unmatched character and include it in the report.
    firsti = -1
    firstj = -1
    inpfirst = ''
    nfcfirst = ''
    tcols = text.split("\t")
    ncols = normalized_text.split("\t")
    for i, tcol in enumerate(tcols):
        for j, tchar in enumerate(tcol):
            nchar = ncols[i][j]
            if tchar != nchar:
                firsti, firstj = i, j
                inpfirst = unicodedata.name(tchar)
                nfcfirst = unicodedata.name(nchar)
                break
        else:
            # No difference in this column; move on to the next one.
            continue
        break

    testmessage = "Unicode not normalized: %s.character[%d] is %s, should be %s." % (COLNAMES[firsti], firstj, inpfirst, nfcfirst)
    warn(testmessage, 'Unicode', testlevel=1, testid='unicode-normalization')