将动态的PDF文件转换成静态的PDF文件
说明: 动态(XFA)PDF 转换成静态 PDF 这一步是借助第三方网站完成的。自己实现转换比较困难(我还没有研究透),所以这里采用模拟请求(爬取)第三方网站的方式来完成转换。第三方网址:https://speedtesting.herokuapp.com/pdfxfa/
爬第三方网站步骤:
1.需要用到的包:
import time
import requests
# 导入requests_toolbelt库使用MultipartEncoder
from requests_toolbelt import MultipartEncoder
2.模拟上传动态PDF:
def upload_pdf(localfile):
    """Upload a local PDF to the third-party XFA conversion site.

    The file is sent as a ``multipart/form-data`` POST to the site's
    ``upload.php`` endpoint, mimicking the request a browser would make.

    Args:
        localfile: Path of the PDF file to upload. The same string is also
            sent as the multipart filename of the ``file1`` field.

    Returns:
        The response body decoded from JSON.

    Raises:
        OSError: If ``localfile`` cannot be opened for reading.
        ValueError: If the response body is not valid JSON.
    """
    url = 'https://speedtesting.herokuapp.com/pdfxfa/upload.php'

    # Open the file in a ``with`` block so the handle is always released,
    # even when the request fails.
    with open(localfile, 'rb') as pdf:
        # Build a body suitable for a multipart/form-data upload.
        # NOTE(review): the 'name' field is hard-coded; confirm whether the
        # server expects the real file name here.
        encoder = MultipartEncoder(
            {'file1': (localfile, pdf), 'name': 'financial.pdf'}
        )
        headers = {
            'Origin': "https://speedtesting.herokuapp.com",
            'Accept-Language': "zh-CN,zh;q=0.9,en;q=0.8",
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/72.0.3626.121 Safari/537.36",
            # The encoder generates the Content-Type together with its own
            # random boundary; a fixed boundary would not match the body.
            'Content-Type': encoder.content_type,
            'Accept': "*/*",
            'Referer': "https://speedtesting.herokuapp.com/pdfxfa/",
            'Connection': "keep-alive",
            'DNT': "1",
            'cache-control': "no-cache",
        }
        # Pass the encoder via ``data`` so the file is streamed as the body.
        response = requests.post(url, headers=headers, data=encoder)

    # Decode the JSON once and reuse it for both the log line and the result.
    result = response.json()
    print(result)
    return result
3.模拟开始转换:
def xfa_to_normal(resultFile):
    """Ask the third-party site for the conversion result of an upload.

    Sends a ``multipart/form-data`` POST containing a single ``resultfile``
    field to ``upload.php?action=getresults``.

    Args:
        resultFile: Value sent as the ``resultfile`` form field.
            NOTE(review): presumably the identifier returned by the upload
            step; confirm against the caller.

    Returns:
        The response body decoded from JSON.

    Raises:
        ValueError: If the response body is not valid JSON.
    """
    url = 'https://speedtesting.herokuapp.com/pdfxfa/upload.php?action=getresults'

    # Build a multipart/form-data body; no file is attached in this step,
    # only the single form field.
    encoder = MultipartEncoder({'resultfile': resultFile})
    headers = {
        'Origin': "https://speedtesting.herokuapp.com",
        'Accept-Language': "zh-CN,zh;q=0.9,en;q=0.8",
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/72.0.3626.121 Safari/537.36",
        # The encoder generates the Content-Type together with its own
        # random boundary; a fixed boundary would not match the body.
        'Content-Type': encoder.content_type,
        'Accept': "*/*",
        'Referer': "https://speedtesting.herokuapp.com/pdfxfa/",
        'Connection': "keep-alive",
        'DNT': "1",
        'cache-control': "no-cache",
    }
    response = requests.post(url, headers=headers, data=encoder)

    # Decode the JSON once and reuse it for both the log line and the result.
    result = response.json()
    print(result)
    return result
4.下载转换好的PDF文件:
def downloadFile(name, url):
    """Download ``url`` into the local file ``name``, printing progress.

    The body is streamed in 512-byte chunks. Roughly every two seconds a
    progress line with the percentage completed and the transfer speed
    (in MiB/s, labelled ``M/S``) is printed.

    Args:
        name: Path of the output file; also used as the prefix of each
            progress line.
        url: URL to download.

    Raises:
        OSError: If ``name`` cannot be opened for writing.
        ValueError: If the ``content-length`` header is not numeric.
    """
    headers = {'Proxy-Connection': 'keep-alive'}
    # Context managers release the connection and the file handle even if
    # the transfer is interrupted by an exception.
    with requests.get(url, stream=True, headers=headers) as response:
        # The header may be absent; 0 is treated as "total size unknown".
        total = float(response.headers.get('content-length', 0))
        with open(name, 'wb') as out:
            received = 0
            received_at_last_report = 0
            # A monotonic clock is unaffected by system clock adjustments.
            last_report = time.monotonic()
            for chunk in response.iter_content(chunk_size=512):
                if not chunk:
                    continue
                out.write(chunk)
                received += len(chunk)
                now = time.monotonic()
                elapsed = now - last_report
                if elapsed > 2:
                    # Without a known total there is no percentage to show.
                    if total > 0:
                        percent = formatFloat(received / total * 100)
                    else:
                        percent = '?'
                    # Divide by the real interval, which can exceed 2 s.
                    speed = (received - received_at_last_report) / 1024 / 1024 / elapsed
                    received_at_last_report = received
                    print(name + ': ' + percent + '%' + ' Speed: ' + formatFloat(speed) + 'M/S')
                    last_report = now