PDF文档-电子发票信息提取

用 PyMuPDF 提取电子发票中的信息:页面文字(正则匹配)、二维码图片、表格。

# %%
import re
import fitz # pip install PyMuPDF
from rich.pretty import pprint


def fapiao_txt_reg(text=""):
    """Extract key fields from the plain text of an electronic invoice.

    Each field has one regular expression. The first capture group of the
    first match is stored under the field's key; a field whose pattern does
    not match is simply left out of the result.

    Args:
        text: Invoice text. ``None`` or an empty string yields ``{}``.

    Returns:
        A dict containing any of the keys ``type`` (title line), ``numb``
        (20-digit invoice number), ``date`` (issue date), ``item`` (first
        ``*category*name`` entry) and ``totl`` (amount after the
        tax-inclusive total label).
    """
    if not text:
        return {}

    patterns = {
        # A real alternation: the previous "[...|...]" was a character class
        # and accepted any title containing one of those single characters.
        "type": r"(电子发票.*(?:普通发票|增值税专用发票).*)",
        # Accept both the ASCII colon and the full-width colon (U+FF1A).
        "numb": r"发票号码\s*[:\uff1a]\s*(\d{20})",
        "date": r"开票日期\s*[:\uff1a]\s*(\d{4}年\d{2}月\d{2}日)",
        # No "|" inside the class, so a literal pipe no longer leaks into
        # the item name.
        "item": r"(\*[\u4e00-\u9fff]+\*[\u4e00-\u9fff\w]+)",
        # Accept both the yen sign (U+00A5) and the full-width one (U+FFE5).
        # The greedy "[\s\S]*" picks the last such amount after the label.
        "totl": r"价税合计[\s\S]*[\u00a5\uffe5](\d+\.\d+)",
    }

    result = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, text)
        if match:
            result[key] = match.group(1)
    return result


# 1. Read the words on every page and extract the invoice fields by regex.
fapiao = "digital_12345678901234567890.pdf"
print(f"filename: {fapiao}")

page_texts = []
# The context manager closes the document even if a page fails to parse.
with fitz.open(fapiao) as src_pdf:
    for p, page in enumerate(src_pdf):
        # "words" yields one tuple per word; index 4 is the word string.
        words = page.get_text("words", sort=True)
        page_text = "\n".join(word[4] for word in words)
        page_texts.append(page_text)
        print(f"- page {p}")
        pprint(page_text.split('\n'))

# Join all pages so that fields on any page are visible to the regexes
# (previously only the last page's text was kept, and `text` was undefined
# for a document without pages).
text = "\n".join(page_texts)

fapiao_infos = fapiao_txt_reg(text)
pprint(fapiao_infos)


# %%
from PIL import Image
from io import BytesIO
from pyzbar import pyzbar


# 2. Find the QR-code image embedded in the pages and decode it.
print(f"filename: {fapiao}")

# Stays None when no image decodes, so the field mapping below cannot hit
# an unbound name.
qr_data = None

# The context manager closes the document even if decoding raises.
with fitz.open(fapiao) as src_pdf:
    for p, page in enumerate(src_pdf):
        for img in page.get_images():
            print(f"- page {p}, ", img)

            # Skip images whose width and height differ (a QR code is square).
            if img[2] != img[3]:
                continue

            # Pull the raw image bytes by xref and load them with Pillow.
            pixmap = src_pdf.extract_image(img[0])
            pprint(pixmap)
            qr_img = Image.open(BytesIO(pixmap["image"]))

            qr_decoded = pyzbar.decode(qr_img)
            pprint(qr_decoded)

            if qr_decoded:
                # If several images decode, the last one wins.
                qr_data = qr_decoded[0].data.decode("utf-8")
                pprint(qr_data)

# The meaning of the comma-separated QR fields is not fully confirmed.
# NOTE(review): "aera" looks like a typo for "area"; kept unchanged because
# it is a runtime key.
keys = ["type", "aera", "x", "numb", "totl", "date", "y", "code"]
fapiao_infos = dict(zip(keys, qr_data.split(","))) if qr_data else {}
pprint(fapiao_infos)

# %%

# 3. Extract the first table detected on the first page.

# The context manager closes the document (it was previously left open).
with fitz.open(fapiao) as src_pdf:
    page = src_pdf[0]
    tabs = page.find_tables().tables
    # Extract while the document is still open; fall back to an empty table
    # when nothing was detected instead of raising IndexError.
    table = tabs[0].extract() if tabs else []

# Raw rows as returned by the table extractor.
for row in table:
    pprint(row)

# Non-empty cells, one per line with embedded newlines removed, and a blank
# line after each row.
for row in table:
    for col in row:
        if col:
            print(col.replace("\n", ""), end='\n')
    print()

# %%