1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102
| import re import fitz from rich.pretty import pprint
def fapiao_txt_reg(text=""):
    """Extract key invoice (fapiao) fields from plain text using regular expressions.

    Args:
        text: Text of an invoice, e.g. the words extracted from a PDF page
            joined by newlines.

    Returns:
        dict mapping a field name to the first capture group of its pattern.
        Possible keys are ``type``, ``numb``, ``date``, ``item`` and ``totl``;
        a key is omitted when its pattern does not match ``text``.
    """
    regs = dict(
        # Alternatives must live in a (non-capturing) group: "[a|b]" is a
        # character class and would accept any single listed character.
        type=r"(电子发票.*(?:普通发票|增值税专用发票).*)",
        # Accept both the ASCII colon and the full-width colon (U+FF1A).
        numb=r"发票号码\s*[:\uff1a]\s*(\d{20})",
        date=r"开票日期\s*[:\uff1a]\s*(\d{4}年\d{2}月\d{2}日)",
        # "|" inside a character class is a literal pipe, so it is not listed.
        item=r"(\*[\u4e00-\u9fff]+\*[\u4e00-\u9fff\w]+)",
        # Accept both the half-width yen sign and the full-width one (U+FFE5).
        # The greedy [\s\S]* means the LAST matching amount after the label wins.
        totl=r"价税合计[\s\S]*[¥\uffe5](\d+\.\d+)",
    )
    result = {}
    for key, reg in regs.items():
        match = re.search(reg, text)
        if match:
            result[key] = match.group(1)
    return result
# --- Extract invoice fields from the PDF's text layer ---
fapiao = "digital_12345678901234567890.pdf"
print(f"filename: {fapiao}")

# Gather the text of every page so that fields on any page can be matched
# (previously only the last page's text survived the loop, and `text` was
# undefined for a document without pages).
page_texts = []
# The context manager closes the document even if extraction raises.
with fitz.open(fapiao) as src_pdf:
    for p, page in enumerate(src_pdf):
        # "words" yields one tuple per word; index 4 holds the word string.
        blks = page.get_text("words", sort=True)
        page_text = "\n".join(blk[4] for blk in blks)
        page_texts.append(page_text)
        print(f"- page {p}")
        pprint(page_text.split('\n'))
text = "\n".join(page_texts)

fapiao_infos = fapiao_txt_reg(text)
pprint(fapiao_infos)
from PIL import Image from io import BytesIO from pyzbar import pyzbar
# --- Extract invoice fields from the QR code embedded in the PDF ---
# Stays None when no QR code can be decoded, so the parsing step below
# does not fail with a NameError.
qr_data = None

# The context manager closes the document even if decoding raises.
with fitz.open(fapiao) as src_pdf:
    print(f"filename: {fapiao}")

    for p, page in enumerate(src_pdf):
        img_list = page.get_images()
        for img in img_list:
            print(f"- page {p}, ", img)
            # img[2] / img[3] are the image width / height in PyMuPDF's
            # get_images() tuples; only square images are QR-code candidates.
            if img[2] != img[3]:
                continue
            # img[0] is the image xref; extract_image returns a dict whose
            # "image" entry holds the raw image bytes.
            pixmap = src_pdf.extract_image(img[0])
            pprint(pixmap)

            qr_img = Image.open(BytesIO(pixmap["image"]))

            qr_decoded = pyzbar.decode(qr_img)
            pprint(qr_decoded)

            if qr_decoded:
                qr_data = qr_decoded[0].data.decode("utf-8")
                pprint(qr_data)

# Field names for the comma-separated QR payload, in payload order.
# NOTE(review): "aera" looks like a typo for "area"; it is kept unchanged
# because it is a runtime dictionary key that other code may rely on.
keys = ["type", "aera", "x", "numb", "totl", "date", "y", "code"]
fapiao_infos = dict(zip(keys, qr_data.split(","))) if qr_data else {}
pprint(fapiao_infos)
# --- Extract the first table found on the first page of the PDF ---
# The context manager closes the document once the table has been read
# (the document was previously left open).
with fitz.open(fapiao) as src_pdf:
    page = src_pdf[0]

    tabs = page.find_tables().tables

    # Fall back to an empty table instead of raising IndexError when the
    # page contains no detectable table.
    table = tabs[0].extract() if tabs else []

# Raw view: one list of cell values per table row.
for row in table:
    pprint(row)

# Flattened view: each non-empty cell on its own line with embedded newlines
# removed, and a blank line after every row.
for row in table:
    for col in row:
        if col:
            print(col.replace("\n", ""), end='\n')
    print()
|