当前位置：源码 > Python源码 >

Python源码示例：unicodedata.lookup()

示例1

def replace_unicode(self, m):
        """Replace escapes."""

        groups = m.groupdict()
        esc = m.group(0)
        if groups.get('fesc'):
            value = m.group(0)
        elif groups.get('format'):
            value = ' '
        elif groups.get('special'):
            value = BACK_SLASH_TRANSLATION[esc]
        elif groups.get('char'):
            try:
                value = chr(int(esc[2:], 16))
            except Exception:
                value = esc
        elif groups.get('oct'):
            value = chr(int(esc[1:], 8))
        elif groups.get('name'):
            try:
                value = unicodedata.lookup(esc[3:-1])
            except Exception:
                value = esc
        return value.replace('\x00', '\n')

示例2

def start_unichar(self, attr):
        if 'name' in attr:
            if 'code' in attr:
                self._syntax_error('<unichar/> invalid with both name and code attributes')
            try:
                v = unicodedata.lookup(attr['name'])
            except KeyError:
                self._syntax_error('<unichar/> invalid name attribute\n"%s"' % ascii(attr['name']))
                v = '\0'
        elif 'code' in attr:
            try:
                v = int(eval(attr['code']))
                v = chr(v) if isPy3 else unichr(v)
            except:
                self._syntax_error('<unichar/> invalid code attribute %s' % ascii(attr['code']))
                v = '\0'
        else:
            v = None
            if attr:
                self._syntax_error('<unichar/> invalid attribute %s' % list(attr.keys())[0])

        if v is not None:
            self.handle_data(v)
        self._push('unichar',_selfClosingTag='unichar')

示例3

def test_aliases(self):
        # Check that the aliases defined in the NameAliases.txt file work.
        # This should be updated when new aliases are added or the file
        # should be downloaded and parsed instead.  See #12753.
        aliases = [
            ('LATIN CAPITAL LETTER GHA', 0x01A2),
            ('LATIN SMALL LETTER GHA', 0x01A3),
            ('KANNADA LETTER LLLA', 0x0CDE),
            ('LAO LETTER FO FON', 0x0E9D),
            ('LAO LETTER FO FAY', 0x0E9F),
            ('LAO LETTER RO', 0x0EA3),
            ('LAO LETTER LO', 0x0EA5),
            ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
            ('YI SYLLABLE ITERATION MARK', 0xA015),
            ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
            ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
        ]
        for alias, codepoint in aliases:
            self.checkletter(alias, chr(codepoint))
            name = unicodedata.name(chr(codepoint))
            self.assertNotEqual(name, alias)
            self.assertEqual(unicodedata.lookup(alias),
                             unicodedata.lookup(name))
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(alias)

示例4

def greek_letter_name_to_unicode(letter):
    # type: (str) -> str
    """
    Return a greek letter name as a Unicode character.

    Examples
    --------
    Lamda -> Λ    (Unicodedata library uses "lamda" for "lambda" :S!)
    Omega -> Ω
    omega -> ω
    """
    return unicodedata.lookup(
        "GREEK {case} LETTER {name}".format(
            case="SMALL" if letter == letter.lower() else "CAPITAL", name=letter.upper()
        )
    )

示例5

def test_aliases(self):
        # Check that the aliases defined in the NameAliases.txt file work.
        # This should be updated when new aliases are added or the file
        # should be downloaded and parsed instead.  See #12753.
        aliases = [
            ('LATIN CAPITAL LETTER GHA', 0x01A2),
            ('LATIN SMALL LETTER GHA', 0x01A3),
            ('KANNADA LETTER LLLA', 0x0CDE),
            ('LAO LETTER FO FON', 0x0E9D),
            ('LAO LETTER FO FAY', 0x0E9F),
            ('LAO LETTER RO', 0x0EA3),
            ('LAO LETTER LO', 0x0EA5),
            ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
            ('YI SYLLABLE ITERATION MARK', 0xA015),
            ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
            ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
        ]
        for alias, codepoint in aliases:
            self.checkletter(alias, chr(codepoint))
            name = unicodedata.name(chr(codepoint))
            self.assertNotEqual(name, alias)
            self.assertEqual(unicodedata.lookup(alias),
                             unicodedata.lookup(name))
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(alias)

示例6

def test_named_sequences_full(self):
        # Check all the named sequences
        url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" %
               unicodedata.unidata_version)
        try:
            testdata = support.open_urlresource(url, encoding="utf-8",
                                                check=check_version)
        except (OSError, HTTPException):
            self.skipTest("Could not retrieve " + url)
        self.addCleanup(testdata.close)
        for line in testdata:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            seqname, codepoints = line.split(';')
            codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())
            self.assertEqual(unicodedata.lookup(seqname), codepoints)
            with self.assertRaises(SyntaxError):
                self.checkletter(seqname, None)
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(seqname)

示例7

def start_unichar(self, attr):
        if 'name' in attr:
            if 'code' in attr:
                self._syntax_error('<unichar/> invalid with both name and code attributes')
            try:
                v = unicodedata.lookup(attr['name']).encode('utf8')
            except KeyError:
                self._syntax_error('<unichar/> invalid name attribute\n"%s"' % name)
                v = '\0'
        elif 'code' in attr:
            try:
                v = unichr(int(eval(attr['code']))).encode('utf8')
            except:
                self._syntax_error('<unichar/> invalid code attribute %s' % attr['code'])
                v = '\0'
        else:
            v = None
            if attr:
                self._syntax_error('<unichar/> invalid attribute %s' % attr.keys()[0])

        if v is not None:
            self.handle_data(v)
        self._push(_selfClosingTag='unichar')

示例8

def test_aliases(self):
        # Check that the aliases defined in the NameAliases.txt file work.
        # This should be updated when new aliases are added or the file
        # should be downloaded and parsed instead.  See #12753.
        aliases = [
            ('LATIN CAPITAL LETTER GHA', 0x01A2),
            ('LATIN SMALL LETTER GHA', 0x01A3),
            ('KANNADA LETTER LLLA', 0x0CDE),
            ('LAO LETTER FO FON', 0x0E9D),
            ('LAO LETTER FO FAY', 0x0E9F),
            ('LAO LETTER RO', 0x0EA3),
            ('LAO LETTER LO', 0x0EA5),
            ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
            ('YI SYLLABLE ITERATION MARK', 0xA015),
            ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
            ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5)
        ]
        for alias, codepoint in aliases:
            self.checkletter(alias, chr(codepoint))
            name = unicodedata.name(chr(codepoint))
            self.assertNotEqual(name, alias)
            self.assertEqual(unicodedata.lookup(alias),
                             unicodedata.lookup(name))
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(alias)

示例9

def test_named_sequences_full(self):
        # Check all the named sequences
        url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" %
               unicodedata.unidata_version)
        try:
            testdata = support.open_urlresource(url, encoding="utf-8",
                                                check=check_version)
        except (OSError, HTTPException):
            self.skipTest("Could not retrieve " + url)
        self.addCleanup(testdata.close)
        for line in testdata:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            seqname, codepoints = line.split(';')
            codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())
            self.assertEqual(unicodedata.lookup(seqname), codepoints)
            with self.assertRaises(SyntaxError):
                self.checkletter(seqname, None)
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(seqname)

示例10

def _greekletters(letterlist):
    for l in letterlist:
        ucharname = l.upper()
        if ucharname == 'LAMBDA':
            ucharname = 'LAMDA'
        smallname = "GREEK SMALL LETTER "+ucharname
        if ucharname == 'EPSILON':
            smallname = "GREEK LUNATE EPSILON SYMBOL"
        if ucharname == 'PHI':
            smallname = "GREEK PHI SYMBOL"
        _latex_specs_base['macros'].append(
            MacroTextSpec(l, unicodedata.lookup(smallname))
        )
        _latex_specs_base['macros'].append(
            MacroTextSpec(l[0].upper()+l[1:], unicodedata.lookup("GREEK CAPITAL LETTER "+ucharname))
            )

示例11

def test_unicode_whitespace(self):
        # Test for http://bugs.jython.org/issue2226
        ws_re = re.compile(r'\s', re.UNICODE)
        not_ws_re = re.compile(r'\S', re.UNICODE)
        separator_categories = set(['Zl', 'Zp', 'Zs'])
        separators = {chr(c) for c in [28, 29, 30, 31]}
        special = set([
            unicodedata.lookup('MONGOLIAN VOWEL SEPARATOR'),
            u'\u0085', # NEXT LINE (NEL)
            ])
        cpython_whitespace = set(' \t\n\r\f\v') | separators | special
        for i in xrange(0xFFFF): # could test to sys.maxunicode, but does not appear to be necessary
            if i >= 0xD800 and i <= 0xDFFF:
                continue
            c = unichr(i)
            if c in cpython_whitespace or category(c) in separator_categories:
                self.assertRegexpMatches(c, ws_re)
                self.assertNotRegexpMatches(c, not_ws_re)
            else:
                self.assertNotRegexpMatches(c, ws_re)
                self.assertRegexpMatches(c, not_ws_re)

示例12

def _greekletters(letterlist):
    for l in letterlist:
        ucharname = l.upper()
        if (ucharname == 'LAMBDA'):
            ucharname = 'LAMDA'
        smallname = "GREEK SMALL LETTER "+ucharname;
        if (ucharname == 'EPSILON'):
            smallname = "GREEK LUNATE EPSILON SYMBOL"
        if (ucharname == 'PHI'):
            smallname = "GREEK PHI SYMBOL"
        _default_macro_list.append(
            (l, unicodedata.lookup(smallname))
            );
        _default_macro_list.append(
            (l[0].upper()+l[1:], unicodedata.lookup("GREEK CAPITAL LETTER "+ucharname))
            );

示例13

def test_unicode_whitespace(self):
        # Test for http://bugs.jython.org/issue2226
        ws_re = re.compile(r'\s', re.UNICODE)
        not_ws_re = re.compile(r'\S', re.UNICODE)
        separator_categories = set(['Zl', 'Zp', 'Zs'])
        separators = {chr(c) for c in [28, 29, 30, 31]}
        special = set([
            unicodedata.lookup('MONGOLIAN VOWEL SEPARATOR'),
            u'\u0085', # NEXT LINE (NEL)
            ])
        cpython_whitespace = set(' \t\n\r\f\v') | separators | special
        for i in xrange(0xFFFF): # could test to sys.maxunicode, but does not appear to be necessary
            if i >= 0xD800 and i <= 0xDFFF:
                continue
            c = unichr(i)
            if c in cpython_whitespace or category(c) in separator_categories:
                self.assertRegexpMatches(c, ws_re)
                self.assertNotRegexpMatches(c, not_ws_re)
            else:
                self.assertNotRegexpMatches(c, ws_re)
                self.assertRegexpMatches(c, not_ws_re)

示例14

def test_unicode(self, sparse):
        # See GH 6885 - get_dummies chokes on unicode values
        import unicodedata
        e = 'e'
        eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
        s = [e, eacute, eacute]
        res = get_dummies(s, prefix='letter', sparse=sparse)
        exp = DataFrame({'letter_e': [1, 0, 0],
                         u('letter_%s') % eacute: [0, 1, 1]},
                        dtype=np.uint8)
        if sparse:
            exp = exp.apply(pd.SparseArray, fill_value=0)
        assert_frame_equal(res, exp)

示例15

def test_ascii_letters(self):
        import unicodedata

        for char in "".join(map(chr, xrange(ord("a"), ord("z")))):
            name = "LATIN SMALL LETTER %s" % char.upper()
            code = unicodedata.lookup(name)
            self.assertEqual(unicodedata.name(code), name)

示例16

def test_bmp_characters(self):
        import unicodedata
        count = 0
        for code in xrange(0x10000):
            char = unichr(code)
            name = unicodedata.name(char, None)
            if name is not None:
                self.assertEqual(unicodedata.lookup(name), char)
                count += 1

示例17

def test_errors(self):
        import unicodedata
        self.assertRaises(TypeError, unicodedata.name)
        self.assertRaises(TypeError, unicodedata.name, u'xx')
        self.assertRaises(TypeError, unicodedata.lookup)
        self.assertRaises(KeyError, unicodedata.lookup, u'unknown')

示例18

def test_unicode(self, sparse):
        # See GH 6885 - get_dummies chokes on unicode values
        import unicodedata
        e = 'e'
        eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
        s = [e, eacute, eacute]
        res = get_dummies(s, prefix='letter', sparse=sparse)
        exp = DataFrame({'letter_e': [1, 0, 0],
                         u('letter_%s') % eacute: [0, 1, 1]},
                        dtype=np.uint8)
        assert_frame_equal(res, exp)

示例19

def test_ascii_letters(self):
        import unicodedata

        for char in "".join(map(chr, xrange(ord("a"), ord("z")))):
            name = "LATIN SMALL LETTER %s" % char.upper()
            code = unicodedata.lookup(name)
            self.assertEqual(unicodedata.name(code), name)

示例20

def test_bmp_characters(self):
        import unicodedata
        count = 0
        for code in xrange(0x10000):
            char = unichr(code)
            name = unicodedata.name(char, None)
            if name is not None:
                self.assertEqual(unicodedata.lookup(name), char)
                count += 1

示例21

def test_errors(self):
        import unicodedata
        self.assertRaises(TypeError, unicodedata.name)
        self.assertRaises(TypeError, unicodedata.name, u'xx')
        self.assertRaises(TypeError, unicodedata.lookup)
        self.assertRaises(KeyError, unicodedata.lookup, u'unknown')

示例22

def U(name):
        """unicode character by name or None if not found"""
        try:
            u = unicodedata.lookup(name)
        except KeyError:
            u = None

            global unicode_warnings
            unicode_warnings += 'No \'%s\' in unicodedata\n' % name

        return u

示例23

def test_ascii_letters(self):
        import unicodedata

        for char in "".join(map(chr, xrange(ord("a"), ord("z")))):
            name = "LATIN SMALL LETTER %s" % char.upper()
            code = unicodedata.lookup(name)
            self.assertEqual(unicodedata.name(code), name)

示例24

def test_bmp_characters(self):
        import unicodedata
        count = 0
        for code in xrange(0x10000):
            char = unichr(code)
            name = unicodedata.name(char, None)
            if name is not None:
                self.assertEqual(unicodedata.lookup(name), char)
                count += 1

示例25

def test_errors(self):
        import unicodedata
        self.assertRaises(TypeError, unicodedata.name)
        self.assertRaises(TypeError, unicodedata.name, u'xx')
        self.assertRaises(TypeError, unicodedata.lookup)
        self.assertRaises(KeyError, unicodedata.lookup, u'unknown')

示例26

def test_unicode(self, sparse):
        # See GH 6885 - get_dummies chokes on unicode values
        import unicodedata
        e = 'e'
        eacute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
        s = [e, eacute, eacute]
        res = get_dummies(s, prefix='letter', sparse=sparse)
        exp = DataFrame({'letter_e': [1, 0, 0],
                         u('letter_%s') % eacute: [0, 1, 1]},
                        dtype=np.uint8)
        if sparse:
            exp = exp.apply(pd.SparseArray, fill_value=0)
        assert_frame_equal(res, exp)

示例27

def test_ascii_letters(self):
        for char in "".join(map(chr, range(ord("a"), ord("z")))):
            name = "LATIN SMALL LETTER %s" % char.upper()
            code = unicodedata.lookup(name)
            self.assertEqual(unicodedata.name(code), name)

示例28

def test_bmp_characters(self):
        for code in range(0x10000):
            char = chr(code)
            name = unicodedata.name(char, None)
            if name is not None:
                self.assertEqual(unicodedata.lookup(name), char)

示例29

def test_named_sequences_sample(self):
        # Check a few named sequences.  See #12753.
        sequences = [
            ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
            ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
            ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
            ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
            ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
        ]
        for seqname, codepoints in sequences:
            self.assertEqual(unicodedata.lookup(seqname), codepoints)
            with self.assertRaises(SyntaxError):
                self.checkletter(seqname, None)
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(seqname)

示例30

def test_errors(self):
        self.assertRaises(TypeError, unicodedata.name)
        self.assertRaises(TypeError, unicodedata.name, 'xx')
        self.assertRaises(TypeError, unicodedata.lookup)
        self.assertRaises(KeyError, unicodedata.lookup, 'unknown')

Python源码示例：unicodedata.lookup()

微信关注