The first eight weeks
1. Character counting
defaultdict is a subclass of dict that supplies a default value for missing keys.
from collections import defaultdict

d = defaultdict(int)
with open('test.txt', 'r') as f:
    for line in f:
        line = line.strip()
        for c in line:
            d[c] += 1
for k, v in sorted(d.items(), key=lambda x: x[1], reverse=True):  # print in descending order of count
    print(k, v)
2. Simple word lookup
Pay attention to the text file's encoding; when a .txt file is opened (e.g. in Notepad), the encoding usually appears in the lower-right corner.
from collections import defaultdict
import re

d = defaultdict(int)
with open("dict.txt", "r", encoding="gbk") as f:  # "ANSI" is not a Python codec name; on a Chinese Windows system it means GBK
    for line in f:
        line = line.strip()
        w, t = line.split("=>")
        d[w] = t
# for k, v in d.items():
#     print(k, v)
while True:
    s = input("Word: ")
    if s in ['q', 'Q']:
        break
    if d.get(s):
        res = re.sub("@", "\n", d[s])  # replace '@' with a newline in the stored definition
        print(res)
3. Chinese character encoding (GB2312)
The code table has gaps (unassigned code points), so use try/except to skip the decoding errors.
import struct

cnt = 0
for H in range(0xa1, 0xf8):      # GB2312 high byte: 0xA1-0xF7
    for L in range(0xa1, 0xff):  # GB2312 low byte: 0xA1-0xFE
        try:
            word = struct.pack("BB", H, L)
            word = word.decode("gbk")
            print(word)
        except UnicodeDecodeError:
            cnt += 1
print("{} errors".format(cnt))
4. Mini assignment: Chinese character frequency statistics; half-width to full-width conversion
Level-1 common Chinese characters: 3755 characters, encoded from 0xB0A1 to 0xD7F9.
The print function takes a file parameter, so output can be redirected to a file, as in the small example below.
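A minimal illustration of the file parameter (the file name out.txt is just a placeholder):
with open("out.txt", "w", encoding="utf-8") as out:
    print("啊", 0.0123, file=out)   # written into out.txt instead of the console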
Approach 1:
import struct
from collections import defaultdict

def fun(char):  # half-width to full-width (this code was copied from elsewhere)
    inside_code = ord(char)
    if inside_code < 0x0020 or inside_code > 0x7e:  # not a half-width ASCII character: return it unchanged
        return char
    if inside_code == 0x0020:  # the space maps to 0x3000; for everything else, full-width = half-width + 0xFEE0
        inside_code = 0x3000
    else:
        inside_code += 0xfee0
    return chr(inside_code)

all_cnt = 0
d = defaultdict(int)
with open("word.txt", "r", encoding="UTF-8") as f:
    for line in f:
        line = line.strip()
        for char in line:
            char = fun(char)
            d[char] += 1
            all_cnt += 1

for H in range(0xb0, 0xd8):       # level-1 characters: 0xB0A1-0xD7F9
    for L in range(0xa1, 0xff):
        if H == 0xd7 and L > 0xf9:  # the last row stops at 0xD7F9
            break
        word = struct.pack("BB", H, L)
        word = word.decode("gbk")
        print("{}, frequency: {}".format(word, d[word] / all_cnt))
Approach 2:
import struct
from collections import defaultdict

def ban_to_quan(char):
    Bytes = char.encode("gbk")   # encode the character into GBK bytes
    if Bytes[0] & 0x80 == 0:     # single-byte (ASCII) character: map it into the full-width block 0xA3xx
        tmp = struct.pack("BB", 0xa3, Bytes[0] + 128)
        res = tmp.decode("gbk")  # decode the two bytes back into a character
    else:
        res = char
    return res

def convert(corpus, res):
    d = defaultdict(int)
    with open(corpus, "r") as i:
        for line in i:
            line = line.strip()
            for c in line:
                quan = ban_to_quan(c)
                d[quan] += 1
    with open(res, "w") as o:
        for k, v in d.items():
            Bytes = k.encode("gbk")  # one Chinese character becomes 2 bytes (4 hex digits)
            if Bytes[0] < 0xd8:      # keep only characters whose high byte is below 0xD8
                print(k, v, file=o)

convert("test.txt", "hzlist.txt")
5. First try with byte streams
import struct

s = "我们在上课"
s = s.encode("gbk")
# for c in s:
#     print("{:#x}".format(c))

with open("hzlist.txt", "wb") as f:  # wb: open in binary mode for writing, starting from an empty file
    for H in range(0xb0, 0xd8):
        for L in range(0xa0, 0xff):
            word = struct.pack("BB", H, L)
            f.write(word)
        f.write("\n".encode("gbk"))  # one row of the code table per line
6. Dot-matrix display of Chinese characters
A Chinese character is usually displayed as a 16×16 dot matrix: 16×16 = 256 dots, one bit per dot, so each glyph takes 256/8 = 32 bytes (2^256 = (2^8)^32 possible patterns).
import struct

KEYS = [0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01]  # bit masks for the 8 bits of a byte, high bit first

res = [[] for _ in range(16)]  # 16 output rows; successive characters are appended to the same rows

def LoadLib(path):
    # read the whole 16x16 bitmap font library; each glyph is 32 bytes (the file used here is 267616 bytes)
    res = []
    with open(path, "rb") as f:
        for i in range(267616 // 32):
            buffer = f.read(32)
            dots = struct.unpack("32B", buffer)
            res.append(dots)
    return res

def GetZX(arr, HZ):  # look up the glyph of HZ by its absolute offset in the font library
    HZ = HZ.encode('gbk')
    return arr[(HZ[0] - 0xa1) * 94 + HZ[1] - 0xa1]

def main(lib, hz):
    arr = LoadLib(lib)
    for h in hz:
        zx = GetZX(arr, h)
        for k, row in enumerate(res):
            for j in range(2):       # each 16-dot row is stored in 2 bytes
                for i in range(8):
                    asc = zx[k * 2 + j]
                    flag = asc & KEYS[i]
                    row.append(flag)
    for row in res:
        for i in row:
            if i:
                print('*', end=' ')
            else:
                print(' ', end=' ')
        print()

main("hzk.dat", "圆月")  # both characters end up side by side on the same 16 rows
Week 11
Forward maximum matching word segmentation
Based on code from earlier sessions.
cidian = set()  # the word dictionary
with open("word.txt", "r", encoding="utf-8") as f:
    ci = f.readline().strip("\n")
    while ci:
        # print(len(ci), ci[0], ci[-1])
        cidian.add(ci)
        ci = f.readline().strip("\n")

def main(s):
    n = len(s)
    ans = []
    l = 0
    while l < n:
        tmp = s[l]                   # fall back to a single character if no word matches
        for r in range(l, n):
            cur_ci = s[l:r + 1]
            print(cur_ci, l, r + 1)  # trace every candidate that is tried
            if cur_ci in cidian:
                tmp = cur_ci         # keep the longest match found so far
        ans.append(tmp)
        l += len(tmp)
    return ans

main("沧浪寄余生")
# set lookup is roughly O(1), same as dict
# strip() removes the given characters from both ends of a string
# one-liner alternative: cidian = set([i.strip() for i in open().readlines()])
Output:
沧 0 1
沧浪 0 2
沧浪寄 0 3
沧浪寄余 0 4
沧浪寄余生 0 5
寄 2 3
寄余 2 4
寄余生 2 5
余 3 4
余生 3 5
['沧浪', '寄', '余生']
Week 14
Covered information retrieval.
English part-of-speech tagging
Given an English POS-tagged corpus, use an HMM model to implement part-of-speech tagging; in a console window, interactively prompt for English input and print the tagged result.
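The notes contain no code for this assignment; below is a minimal sketch (not the course solution) of the counting step that estimates the HMM's transition and emission probabilities. The word/TAG corpus format and the two toy sentences are assumptions made for illustration; decoding with these probabilities is the Viterbi step sketched further down.
from collections import defaultdict

# Tiny inline corpus in an assumed word/TAG format; a real run would read the provided tagged corpus.
corpus = ["the/DET dog/NOUN barks/VERB", "a/DET cat/NOUN sleeps/VERB"]

trans = defaultdict(lambda: defaultdict(int))  # trans[prev_tag][tag]: transition counts
emit = defaultdict(lambda: defaultdict(int))   # emit[tag][word]: emission counts
tag_cnt = defaultdict(int)

for line in corpus:
    prev = "<s>"                                # sentence-start pseudo tag
    for token in line.split():
        word, tag = token.rsplit("/", 1)
        trans[prev][tag] += 1
        emit[tag][word.lower()] += 1
        tag_cnt[tag] += 1
        prev = tag

def p_trans(prev, tag):   # P(tag | prev) with add-one smoothing over the tag set
    return (trans[prev][tag] + 1) / (sum(trans[prev].values()) + len(tag_cnt))

def p_emit(tag, word):    # P(word | tag) with add-one smoothing
    return (emit[tag][word.lower()] + 1) / (tag_cnt[tag] + len(emit[tag]) + 1)

print(p_trans("DET", "NOUN"), p_emit("NOUN", "dog"))   # 0.6 0.4 on this toy corpus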
n-gram
An n-gram model is a probabilistic language model based on an (n-1)-order Markov chain; it infers the structure of a sentence from the probabilities of sequences of n words.
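For example, a bigram (n = 2) model can be estimated by plain counting; this sketch uses made-up toy sentences, not data from the course:
from collections import defaultdict

def bigram_model(sentences):
    # Count unigrams and bigrams, then estimate P(w2 | w1) by maximum likelihood.
    uni = defaultdict(int)
    bi = defaultdict(int)
    for sent in sentences:
        words = ["<s>"] + sent.split() + ["</s>"]
        for w in words:
            uni[w] += 1
        for w1, w2 in zip(words, words[1:]):
            bi[(w1, w2)] += 1
    return lambda w1, w2: bi[(w1, w2)] / uni[w1] if uni[w1] else 0.0

p = bigram_model(["the cat sat", "the dog sat", "the cat ran"])
print(p("the", "cat"))   # 2/3: "cat" follows "the" in 2 of the 3 occurrences of "the"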
Viterbi algorithm
A dynamic-programming algorithm for finding the hidden state sequence (the Viterbi path) most likely to have produced the observed sequence of events.
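A self-contained sketch of Viterbi decoding on a toy two-tag HMM; the tags, words, and all probabilities below are made-up illustration values, not estimates from any corpus:
# Toy HMM with two tags and a handful of words.
states = ["NOUN", "VERB"]
start_p = {"NOUN": 0.6, "VERB": 0.4}
trans_p = {"NOUN": {"NOUN": 0.3, "VERB": 0.7},
           "VERB": {"NOUN": 0.8, "VERB": 0.2}}
emit_p = {"NOUN": {"dogs": 0.4, "cats": 0.4, "bark": 0.2},
          "VERB": {"dogs": 0.1, "cats": 0.1, "bark": 0.8}}

def viterbi(obs):
    # V[t][s]: probability of the best path that ends in state s after observation t
    V = [{s: start_p[s] * emit_p[s].get(obs[0], 1e-6) for s in states}]
    back = [{}]
    for t in range(1, len(obs)):
        V.append({})
        back.append({})
        for s in states:
            prob, prev = max((V[t - 1][p] * trans_p[p][s] * emit_p[s].get(obs[t], 1e-6), p)
                             for p in states)
            V[t][s] = prob
            back[t][s] = prev
    # Trace back from the best final state to recover the whole path.
    last = max(V[-1], key=V[-1].get)
    path = [last]
    for t in range(len(obs) - 1, 0, -1):
        last = back[t][last]
        path.append(last)
    return path[::-1]

print(viterbi("dogs bark".split()))   # expected: ['NOUN', 'VERB']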
Bayesian gender classification of names
Given a training corpus of personal names, use a naive Bayes classifier so that, for any given name, its gender can be predicted.
Naive Bayes
Naive Bayes is a classification method based on Bayes' theorem together with the assumption that features are conditionally independent given the class.
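A minimal sketch of the name/gender task under these assumptions: each character of the name is a feature, probabilities use add-one smoothing, and the four training names are made up for illustration (a real run would use the provided name corpus):
from collections import defaultdict
import math

def train(names):
    # names: list of (name, gender) pairs; each character of the name is a feature
    prior = defaultdict(int)
    char_cnt = {"M": defaultdict(int), "F": defaultdict(int)}
    for name, g in names:
        prior[g] += 1
        for ch in name:
            char_cnt[g][ch] += 1
    return prior, char_cnt

def classify(name, prior, char_cnt):
    total = sum(prior.values())
    vocab = len(set(char_cnt["M"]) | set(char_cnt["F"]))  # distinct characters seen in training
    best, best_score = None, float("-inf")
    for g in prior:
        n_g = sum(char_cnt[g].values())
        # log P(g) + sum of log P(ch | g), with add-one smoothing
        score = math.log(prior[g] / total)
        for ch in name:
            score += math.log((char_cnt[g][ch] + 1) / (n_g + vocab))
        if score > best_score:
            best, best_score = g, score
    return best

data = [("丽娟", "F"), ("秀英", "F"), ("建国", "M"), ("志强", "M")]  # made-up toy training data
prior, char_cnt = train(data)
print(classify("秀丽", prior, char_cnt))   # expected: F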
Other
About the final exam:
- final project
References:
Recommended reading: