PDF文档翻译

PDF 文档全文翻译，并保留原有的 PDF 版式。

1. 采用 PyMuPDF 提取 PDF 中的图片和文字

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import fitz
import re
from pprint import pprint

# Rebuild a PDF page by page: images are copied to the same position, and every
# text block is replaced by its translation placed in the original bounding box.
pdf_name = 'xxx-en.pdf'
print(f'Source pdf file: {pdf_name} \n')
src_pdf = fitz.open(pdf_name)
new_pdf = fitz.open()  # empty in-memory document that receives the output

for p, page in enumerate(src_pdf):
    print(f'\n- translating PAGE -{p}- ...')

    # 1.1 Create a new page with the same size as the source page
    new_page = new_pdf.new_page(width=page.rect.width, height=page.rect.height)

    blocks = page.get_text('dict')['blocks']

    # 1.2 Images: blocks with type == 1 carry the raw image bytes in 'image'
    img_blks = [b for b in blocks if b['type'] == 1]
    for img in img_blks:
        new_page.insert_image(img['bbox'], stream=img['image'])

    # 1.3 Text: every block that is not an image block
    txt_blks = [b for b in blocks if b['type'] != 1]
    for txt in txt_blks:
        # Merge all spans of the block into one string, then replace symbols
        # and control characters with spaces before translating.
        text_tmp = ''.join([s['text'] for l in txt['lines'] for s in l['spans']])
        text_tmp = re.sub('[@#$%^&*\'\"\n\r\t]', ' ', text_tmp).strip()

        if not text_tmp:
            continue

        # Stand-in translation; swap in a real translator, e.g.
        # text_translate = youdao(text_tmp)
        text_translate = '中国 ' + text_tmp

        # NOTE: the font file path is Windows-specific.
        rc = new_page.insert_textbox(txt['bbox'], text_translate,
                                     fontsize=6,
                                     fontname='simhei',
                                     fontfile=r'C:\Windows\Fonts\simhei.ttf')
        # insert_textbox returns a negative number and writes nothing when
        # the text does not fit the rectangle; report it instead of silently
        # losing the block.
        if rc < 0:
            print(f'!!! text does not fit {txt["bbox"]}: {text_translate}')

    # To translate only the first two pages while debugging:
    # if p == 1:
    #     break

new_name = pdf_name.replace('.pdf', '-zh.pdf')
new_pdf.save(new_name)

# Release the file handle / memory held by both documents.
new_pdf.close()
src_pdf.close()

print('\n------Done!-------')

2. 有道翻译

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
# %%
# %%
import requests
import json
import time


def youdao(en_txt=''):
    """Translate text with the Youdao web translate endpoint.

    Args:
        en_txt: Source text to translate.

    Returns:
        The ``tgt`` strings of the first entry of ``translateResult``,
        concatenated into one string.

    Raises:
        requests.RequestException: On connection problems or timeout.
        KeyError: If the response JSON has no ``translateResult`` field.
    """
    api_url = 'http://fanyi.youdao.com/translate'

    # Pass the text through ``params`` so requests URL-encodes it; formatting
    # it straight into the URL breaks on characters such as '&', '#' or '%'.
    query = {'i': en_txt, 'doctype': 'json'}
    # A timeout keeps a stalled connection from blocking the caller forever.
    res = requests.get(api_url, params=query, timeout=10.0).json()
    time.sleep(3.0)  # pause between calls to throttle the request rate

    # NOTE(review): only the first entry of translateResult is read;
    # presumably further entries exist for multi-line input -- confirm.
    zh_txt = ''.join(seq['tgt'] for seq in res['translateResult'][0])

    print(f'*** {en_txt} \n--> {zh_txt}')

    return zh_txt


# Quick manual check of youdao() with a sample sentence.
en_txt = (
    'so we beat on, boats against the current, '
    'borne back ceaselessly into the past.'
)
youdao(en_txt)

3. 百度翻译

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# %%
import requests
import random
import json
from hashlib import md5
import time


# Baidu Translate API credentials; reference: https://api.fanyi.baidu.com/doc/
# Both values are concatenated into the request signature built in baidu().
# NOTE(review): these look like placeholder values -- presumably they must be
# replaced with a real APP ID / key before a request can succeed; confirm.
appid = '2222222222222222'
appkey = 'ooooooooooooooooooo'

# Generate salt and sign
def make_md5(s, encoding='utf-8'):
    """Return the hexadecimal MD5 digest of *s*.

    Args:
        s: Text to hash.
        encoding: Codec used to turn *s* into bytes before hashing.

    Returns:
        The 32-character lowercase hex digest.
    """
    return md5(s.encode(encoding)).hexdigest()


def baidu(en_txt=''):
    """Translate English text to Chinese with the Baidu Translate API.

    Args:
        en_txt: English source text.

    Returns:
        The concatenated ``dst`` strings of ``trans_result`` on success,
        otherwise ``None`` (the raw response is printed when it carries an
        ``error_code``).

    Raises:
        requests.RequestException: On connection problems or timeout.
    """
    # The request is signed with MD5(appid + query + salt + appkey).
    salt = random.randint(32768, 65536)
    sign = make_md5(appid + en_txt + str(salt) + appkey)

    api_url = 'http://api.fanyi.baidu.com/api/trans/vip/translate'
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    payload = {'appid': appid, 'q': en_txt, 'from': 'en', 'to': 'zh', 'salt': salt, 'sign': sign}

    res = requests.get(api_url, params=payload, headers=headers, timeout=3.0).json()
    time.sleep(3.0)  # pause between calls to throttle the request rate

    if 'trans_result' in res:
        zh_txt = ''.join([seq['dst'] for seq in res['trans_result']])
        print(f'*** {en_txt} \n--> {zh_txt}')
        return zh_txt

    if 'error_code' in res:
        print(f'*** {en_txt} \n??? {res}')

    # Make the failure result explicit rather than falling off the end.
    return None


# Quick manual check of baidu() with a sample sentence.
en_txt = (
    'so we beat on, boats against the current, '
    'borne back ceaselessly into the past.'
)
baidu(en_txt)