Skip to content

Commit f1253c5

Browse files
committed
🎉 学习第4章Unicode文本和字节序列
1 parent ce910b4 commit f1253c5

File tree

6 files changed

+1067
-0
lines changed

6 files changed

+1067
-0
lines changed

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# 《流畅的Python》阅读笔记
2+
3+
原书的项目地址:https://github.com/fluentpython/example-code-2e

codes/ch04/default_encoding.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/usr/bin/env python
2+
# encoding: utf-8
3+
"""
4+
@author: HuRuiFeng
5+
@file: default_encoding.py
6+
@time: 2023/9/18 13:09
7+
@project: fluent-python
8+
@desc: P100 探索默认编码
9+
"""
10+
11+
import locale
import sys

# Expressions evaluated one at a time to reveal every default encoding in
# effect: the locale's preferred encoding, the encoding of an opened text
# file, the standard streams, and the filesystem encoding.
expressions = """
locale.getpreferredencoding()
type(my_file)
my_file.encoding
sys.stdout.isatty()
sys.stdout.encoding
sys.stdin.isatty()
sys.stdin.encoding
sys.stderr.isatty()
sys.stderr.encoding
sys.getdefaultencoding()
sys.getfilesystemencoding()
"""

# Use a context manager so the probe file is always closed; the original
# opened 'dummy' and never closed it (resource leak / ResourceWarning).
with open('dummy', 'w') as my_file:
    for expression in expressions.split():
        # eval() of our own literal expression list above — not untrusted input.
        value = eval(expression)
        print(f'{expression:>30} -> {value!r}')

codes/ch04/normeq.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/usr/bin/env python
2+
# encoding: utf-8
3+
"""
4+
@author: HuRuiFeng
5+
@file: normeq.py
6+
@time: 2023/9/18 14:11
7+
@project: fluent-python
8+
@desc: P108 规范化Unicode字符串,准确比较
9+
"""
10+
from unicodedata import normalize
11+
12+
13+
def nfc_equal(str1, str2):
    """Report whether *str1* and *str2* are equal after NFC normalization.

    Composing both strings to Normalization Form C makes a precomposed
    character compare equal to its base-character + combining-mark form.
    """
    left = normalize('NFC', str1)
    right = normalize('NFC', str2)
    return left == right
15+
16+
17+
def fold_equal(str1, str2):
    """Case-insensitive equality after NFC normalization.

    Applies NFC then str.casefold() to both arguments, so 'A' matches
    'a' and 'ß' matches 'ss'.
    """
    folded = [normalize('NFC', s).casefold() for s in (str1, str2)]
    return folded[0] == folded[1]
20+
21+
22+
if __name__ == '__main__':
    composed = 'café'          # single precomposed U+00E9
    decomposed = 'cafe\u0301'  # 'e' followed by COMBINING ACUTE ACCENT
    # Plain codepoint comparison sees them as different: False.
    print(composed == decomposed)
    # Normalized comparison treats them as equal: True.
    print(nfc_equal(composed, decomposed))
    # nfc_equal is still case-sensitive: False.
    print(nfc_equal('A', 'a'))

codes/ch04/ramanujan.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#!/usr/bin/env python
2+
# encoding: utf-8
3+
"""
4+
@author: HuRuiFeng
5+
@file: ramanujan.py
6+
@time: 2023/9/18 15:56
7+
@project: fluent-python
8+
@desc: P117 比较简单的str和bytes正则表达式的行为
9+
"""
10+
import re
11+
12+
# str类型
13+
re_numbers_str = re.compile(r'\d+')
14+
re_words_str = re.compile(r'\w+')
15+
# bytes类型
16+
re_numbers_bytes = re.compile(rb'\d+')
17+
re_words_bytes = re.compile(rb'\w+')
18+
19+
text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef"
20+
" as 1729 = 1³ + 12³ = 9³ + 10³.")
21+
22+
# bytes正则表达式只能搜索bytes字符串
23+
text_bytes = text_str.encode('utf_8')
24+
25+
print(f'Text\n {text_str!r}')
26+
print('Numbers')
27+
# str模式r'\d+'只能匹配泰米尔数值和ASCII数字
28+
print(' str :', re_numbers_str.findall(text_str))
29+
# bytes模式rb'\d+'只能匹配ASCII字节中的数字
30+
print(' bytes:', re_numbers_bytes.findall(text_bytes))
31+
print('Words')
32+
# str模式r'\w+'能匹配字母、上标、泰米尔数字和ASCII数字
33+
print(' str :', re_words_str.findall(text_str))
34+
# bytes模式rb'\w+'只能匹配ASCII字节中的字母和数字
35+
print(' bytes:', re_words_bytes.findall(text_bytes))

codes/ch04/simplify.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
#!/usr/bin/env python
2+
# encoding: utf-8
3+
"""
4+
@author: HuRuiFeng
5+
@file: simplify.py
6+
@time: 2023/9/18 14:14
7+
@project: fluent-python
8+
@desc: P109 去掉全部组合记号的函数
9+
"""
10+
import string
11+
12+
import unicodedata
13+
14+
15+
def shave_marks(txt):
    """Remove all diacritic marks from *txt*, in every script.

    Decomposes to NFD so each accented character becomes a base character
    plus combining marks, drops every combining mark, then recomposes the
    survivors to NFC.
    """
    decomposed = unicodedata.normalize('NFD', txt)
    keep = [ch for ch in decomposed if not unicodedata.combining(ch)]
    return unicodedata.normalize('NFC', ''.join(keep))
24+
25+
26+
def shave_marks_latin(txt):
    """Remove diacritic marks only when attached to a Latin base character.

    Walks the NFD decomposition of *txt*: combining marks that follow an
    ASCII letter are dropped, while marks on non-Latin bases (e.g. Greek)
    are preserved. The surviving characters are recomposed to NFC.
    """
    decomposed = unicodedata.normalize('NFD', txt)
    keep = []
    base_is_latin = False
    for ch in decomposed:
        is_mark = unicodedata.combining(ch)
        if is_mark and base_is_latin:
            continue  # drop a diacritic that sits on a Latin base
        keep.append(ch)
        if not is_mark:
            # A non-combining character starts a new base character.
            base_is_latin = ch in string.ascii_letters
    return unicodedata.normalize('NFC', ''.join(keep))
40+
41+
42+
# Translation table for cp1252 "smart punctuation" that maps one character
# to one ASCII character (curly quotes -> straight quotes, dashes -> '-', …).
single_map = str.maketrans("""‚ƒ„ˆ‹‘’“”•–—˜›""",
                           """'f"^<''""---~>""")

# Translation table for symbols that need a multi-character ASCII
# replacement (built from a dict, since maketrans string form is 1:1 only).
multi_map = str.maketrans({
    '€': 'EUR',
    '…': '...',
    'Æ': 'AE',
    'æ': 'ae',
    'Œ': 'OE',
    'œ': 'oe',
    '™': '(TM)',
    '‰': '<per mille>',
    '†': '**',
    '‡': '***',
})

# Merge the two tables so multi_map handles both kinds of replacement.
multi_map.update(single_map)
59+
60+
61+
def dewinize(txt):
    """Replace cp1252 (Windows-1252) symbols in *txt* with ASCII characters
    or character sequences, using the module-level ``multi_map`` table."""
    cleaned = txt.translate(multi_map)
    return cleaned
64+
65+
66+
def asciize(txt):
    """Best-effort ASCII transliteration of *txt*.

    Runs dewinize() to replace cp1252 symbols, strips diacritics from
    Latin base characters, spells out 'ß' as 'ss' (it carries no combining
    mark to shave), and finally applies NFKC to compose compatibility
    characters.
    """
    shaved = shave_marks_latin(dewinize(txt))
    shaved = shaved.replace('ß', 'ss')
    return unicodedata.normalize('NFKC', shaved)
72+
73+
74+
if __name__ == '__main__':
    order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
    greek = 'Ζέφυρος, Zéfiro'

    # Diacritics removed from every script:
    print(shave_marks(order))
    print(shave_marks(greek))

    # cp1252 symbols replaced, then the full ASCII folding pipeline:
    print(dewinize(order))
    print(asciize(order))

0 commit comments

Comments
 (0)