Skip to content

Commit f1253c5

Browse files
committed
🎉 学习第4章Unicode文本和字节序列
1 parent ce910b4 commit f1253c5

File tree

6 files changed

+1067
-0
lines changed

6 files changed

+1067
-0
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# 《流畅的Python》阅读笔记
2+
3+
原书的项目地址:https://github.com/fluentpython/example-code-2e

codes/ch04/default_encoding.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/usr/bin/env python
2+
# encoding: utf-8
3+
"""
4+
@author: HuRuiFeng
5+
@file: default_encoding.py
6+
@time: 2023/9/18 13:09
7+
@project: fluent-python
8+
@desc: P100 探索默认编码
9+
"""
10+
11+
import locale
import sys

# Expressions evaluated one at a time to reveal every default encoding in
# effect: the locale's preferred encoding, the encoding of an opened text
# file, the standard streams, and the filesystem encoding.
expressions = """
locale.getpreferredencoding()
type(my_file)
my_file.encoding
sys.stdout.isatty()
sys.stdout.encoding
sys.stdin.isatty()
sys.stdin.encoding
sys.stderr.isatty()
sys.stderr.encoding
sys.getdefaultencoding()
sys.getfilesystemencoding()
"""

# Use a context manager so the probe file is always closed; the original
# opened 'dummy' and never closed it (resource leak / ResourceWarning).
with open('dummy', 'w') as my_file:
    for expression in expressions.split():
        # eval() of our own literal expression list above — not untrusted input.
        value = eval(expression)
        print(f'{expression:>30} -> {value!r}')

codes/ch04/normeq.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/usr/bin/env python
2+
# encoding: utf-8
3+
"""
4+
@author: HuRuiFeng
5+
@file: normeq.py
6+
@time: 2023/9/18 14:11
7+
@project: fluent-python
8+
@desc: P108 规范化Unicode字符串,准确比较
9+
"""
10+
from unicodedata import normalize
11+
12+
13+
def nfc_equal(str1, str2):
    """Report whether *str1* and *str2* are equal after NFC normalization.

    Composing both strings to Normalization Form C makes a precomposed
    character compare equal to its base-character + combining-mark form.
    """
    left = normalize('NFC', str1)
    right = normalize('NFC', str2)
    return left == right
15+
16+
17+
def fold_equal(str1, str2):
    """Case-insensitive equality after NFC normalization.

    Applies NFC then str.casefold() to both arguments, so 'A' matches
    'a' and 'ß' matches 'ss'.
    """
    folded = [normalize('NFC', s).casefold() for s in (str1, str2)]
    return folded[0] == folded[1]
20+
21+
22+
if __name__ == '__main__':
    composed = 'café'          # single precomposed U+00E9
    decomposed = 'cafe\u0301'  # 'e' followed by COMBINING ACUTE ACCENT
    # Plain codepoint comparison sees them as different: False.
    print(composed == decomposed)
    # Normalized comparison treats them as equal: True.
    print(nfc_equal(composed, decomposed))
    # nfc_equal is still case-sensitive: False.
    print(nfc_equal('A', 'a'))

codes/ch04/ramanujan.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#!/usr/bin/env python
2+
# encoding: utf-8
3+
"""
4+
@author: HuRuiFeng
5+
@file: ramanujan.py
6+
@time: 2023/9/18 15:56
7+
@project: fluent-python
8+
@desc: P117 比较简单的str和bytes正则表达式的行为
9+
"""
10+
import re
11+
12+
# str类型
13+
re_numbers_str = re.compile(r'\d+')
14+
re_words_str = re.compile(r'\w+')
15+
# bytes类型
16+
re_numbers_bytes = re.compile(rb'\d+')
17+
re_words_bytes = re.compile(rb'\w+')
18+
19+
text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef"
20+
" as 1729 = 1³ + 12³ = 9³ + 10³.")
21+
22+
# bytes正则表达式只能搜索bytes字符串
23+
text_bytes = text_str.encode('utf_8')
24+
25+
print(f'Text\n {text_str!r}')
26+
print('Numbers')
27+
# str模式r'\d+'只能匹配泰米尔数值和ASCII数字
28+
print(' str :', re_numbers_str.findall(text_str))
29+
# bytes模式rb'\d+'只能匹配ASCII字节中的数字
30+
print(' bytes:', re_numbers_bytes.findall(text_bytes))
31+
print('Words')
32+
# str模式r'\w+'能匹配字母、上标、泰米尔数字和ASCII数字
33+
print(' str :', re_words_str.findall(text_str))
34+
# bytes模式rb'\w+'只能匹配ASCII字节中的字母和数字
35+
print(' bytes:', re_words_bytes.findall(text_bytes))

codes/ch04/simplify.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#!/usr/bin/env python
2+
# encoding: utf-8
3+
"""
4+
@author: HuRuiFeng
5+
@file: simplify.py
6+
@time: 2023/9/18 14:14
7+
@project: fluent-python
8+
@desc: P109 去掉全部组合记号的函数
9+
"""
10+
import string
11+
12+
import unicodedata
13+
14+
15+
def shave_marks(txt):
    """Remove all diacritic marks from *txt*, in every script.

    Decomposes to NFD so each accented character becomes a base character
    plus combining marks, drops every combining mark, then recomposes the
    survivors to NFC.
    """
    decomposed = unicodedata.normalize('NFD', txt)
    keep = [ch for ch in decomposed if not unicodedata.combining(ch)]
    return unicodedata.normalize('NFC', ''.join(keep))
24+
25+
26+
def shave_marks_latin(txt):
    """Remove diacritic marks only when attached to a Latin base character.

    Walks the NFD decomposition of *txt*: combining marks that follow an
    ASCII letter are dropped, while marks on non-Latin bases (e.g. Greek)
    are preserved. The surviving characters are recomposed to NFC.
    """
    decomposed = unicodedata.normalize('NFD', txt)
    keep = []
    base_is_latin = False
    for ch in decomposed:
        is_mark = unicodedata.combining(ch)
        if is_mark and base_is_latin:
            continue  # drop a diacritic that sits on a Latin base
        keep.append(ch)
        if not is_mark:
            # A non-combining character starts a new base character.
            base_is_latin = ch in string.ascii_letters
    return unicodedata.normalize('NFC', ''.join(keep))
40+
41+
42+
# Translation table for cp1252 "smart punctuation" that maps one character
# to one ASCII character (curly quotes -> straight quotes, dashes -> '-', …).
single_map = str.maketrans("""‚ƒ„ˆ‹‘’“”•–—˜›""",
                           """'f"^<''""---~>""")

# Translation table for symbols that need a multi-character ASCII
# replacement (built from a dict, since maketrans string form is 1:1 only).
multi_map = str.maketrans({
    '€': 'EUR',
    '…': '...',
    'Æ': 'AE',
    'æ': 'ae',
    'Œ': 'OE',
    'œ': 'oe',
    '™': '(TM)',
    '‰': '<per mille>',
    '†': '**',
    '‡': '***',
})

# Merge the two tables so multi_map handles both kinds of replacement.
multi_map.update(single_map)
59+
60+
61+
def dewinize(txt):
    """Replace cp1252 (Windows-1252) symbols in *txt* with ASCII characters
    or character sequences, using the module-level ``multi_map`` table."""
    cleaned = txt.translate(multi_map)
    return cleaned
64+
65+
66+
def asciize(txt):
    """Best-effort ASCII transliteration of *txt*.

    Runs dewinize() to replace cp1252 symbols, strips diacritics from
    Latin base characters, spells out 'ß' as 'ss' (it carries no combining
    mark to shave), and finally applies NFKC to compose compatibility
    characters.
    """
    shaved = shave_marks_latin(dewinize(txt))
    shaved = shaved.replace('ß', 'ss')
    return unicodedata.normalize('NFKC', shaved)
72+
73+
74+
if __name__ == '__main__':
    order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
    greek = 'Ζέφυρος, Zéfiro'

    # Diacritics removed from every script:
    print(shave_marks(order))
    print(shave_marks(greek))

    # cp1252 symbols replaced, then the full ASCII folding pipeline:
    print(dewinize(order))
    print(asciize(order))

0 commit comments

Comments
 (0)