一个在IT行业摸爬滚打的程序猿

0%

从PDF中提取图片,有两个库可以使用:fitz(需要install pymupdf)和pdfminer(需要install pdfminer3k)。

前言:下面的代码都带有注释。这些代码参考了别人的实现,但因为时间隔得太久,已经忘记来源了;如果哪位小伙伴知道出处,欢迎提供一下链接。

方法一:fitz(需要install pymupdf),这个方法经过测试是可用的
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2019/3/19 08:51
# @Author : qizai
# @File : fetch_pdf_v1.py
# @Software: PyCharm

import fitz # pip3 install pymupdf
import time
import re
import os


def get_image(path, pic_path):
    """Extract the images embedded in a PDF and save each one as a PNG.

    Every xref object of the document is inspected; an object counts as an
    image when its definition contains both ``/Type /XObject`` and
    ``/Subtype /Image``. Each image is written to ``pic_path`` under a name
    derived from ``path`` plus a running ``_img<N>.png`` suffix.

    :param path: path of the PDF file to read
    :param pic_path: directory that receives the PNG files (this function
        does not create it)
    :return: None
    """
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    t0 = time.perf_counter()
    # 00. Regular expressions used to recognise image objects in the PDF.
    checkXO = re.compile(r"/Type(?= */XObject)")
    checkIM = re.compile(r"/Subtype(?= */Image)")

    # 1. Open the PDF and print some information about it.
    doc = fitz.open(path)
    try:
        # Newer PyMuPDF releases expose these helpers under public names;
        # fall back to the legacy private names on older releases.
        xref_length = getattr(doc, "xref_length", None) or doc._getXrefLength
        xref_object = getattr(doc, "xref_object", None) or doc._getXrefString

        # Number of images extracted so far.
        imgcount = 0
        lenXREF = xref_length()

        print("文件名:{}, 页数: {}, 对象: {}".format(path, len(doc), lenXREF - 1))

        # Path separators and drive colons are stripped from the name so that
        # os.path.join() always keeps the file inside pic_path (an absolute
        # second component would otherwise discard pic_path entirely).
        name_prefix = path.replace('\\', '_').replace('/', '_')

        # 2. Walk all objects and skip everything that is not an image.
        for i in range(1, lenXREF):
            text = xref_object(i)
            isXObject = checkXO.search(text)
            isImage = checkIM.search(text)
            if not isXObject or not isImage:
                print("不是图片")
                continue
            imgcount += 1
            # Build a pixmap from the xref number of the image.
            pix = fitz.Pixmap(doc, i)
            new_name = name_prefix + "_img{}.png".format(imgcount)
            new_name = new_name.replace(':', '')
            target = os.path.join(pic_path, new_name)

            # 3. Save as PNG. Pixmaps with fewer than 5 components can be
            # written directly; anything else is converted to RGB first.
            if pix.n < 5:
                _write_png(pix, target)
            else:
                pix0 = fitz.Pixmap(fitz.csRGB, pix)
                _write_png(pix0, target)
                pix0 = None
            # Drop the reference so the pixmap memory can be released.
            pix = None
    finally:
        doc.close()

    t1 = time.perf_counter()
    print("运行时间:{}s".format(t1 - t0))
    print("提取了{}张图片".format(imgcount))


def _write_png(pix, target):
    """Write ``pix`` to ``target``, using ``save`` when the pixmap has it and ``writePNG`` otherwise."""
    save = getattr(pix, "save", None)
    if save is not None:
        save(target)
    else:
        pix.writePNG(target)


# 运行
if __name__ == '__main__':
    # PDF to process (sample file used for testing image extraction).
    path = r"Selenium 自动化爬虫.pdf"
    # Folder that will receive the extracted images.
    pic_path = r"image"
    # Only proceed with a freshly created folder; stop if it already exists.
    if not os.path.exists(pic_path):
        os.mkdir(pic_path)
    else:
        print("文件夹已存在,请重新创建新文件夹!")
        raise SystemExit
    get_image(path, pic_path)
方法二:pdfminer(需要install pdfminer3k),这个库提取文字还可以,但提取图片暂时识别不了
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time : 2019/3/19 11:21
# @Author : qizai
# @File : fetch_pdf_v2.py
# @Software: PyCharm

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, PDFTextExtractionNotAllowed
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTImage, LTCurve, LTFigure

# 如果要提取图片,那么需要导入一下几个库
import sys
import os
from binascii import b2a_hex

images_folder = r'image' # Folder where extracted images are stored


def save_image(lt_image, page_number, images_folder):
    """Try to save the raw data of an LTImage object to ``images_folder``.

    The file type is determined by ``determine_image_type`` from the first
    four bytes of the stream, and the data is written with ``write_file``.
    The file is named ``<page_number>_<image name><extension>``.

    :return: the file name on success, otherwise None
    """
    # Nothing to save when the image carries no stream.
    if not lt_image.stream:
        return None

    header = lt_image.stream.get_rawdata()[0:4]
    extension = determine_image_type(header)
    # Unrecognised image type: give up without writing anything.
    if not extension:
        return None

    target_name = str(page_number) + '_' + lt_image.name + extension
    saved = write_file(images_folder, target_name, lt_image.stream.get_rawdata(), flags='wb')
    return target_name if saved else None


def determine_image_type(stream_first_4_bytes):
    """Return the image file extension matching a stream's magic number.

    The leading bytes are compared directly as ``bytes``. The previous
    implementation compared the ``bytes`` result of ``b2a_hex`` against
    ``str`` literals, which raises ``TypeError`` (``startswith``) or never
    matches (``==``) on Python 3.

    :param stream_first_4_bytes: the first 4 bytes (bytes-like) of the image data
    :return: '.jpeg', '.png', '.gif' or '.bmp'; None when nothing matches
    """
    header = bytes(stream_first_4_bytes)
    # (magic prefix, extension) pairs, checked in order.
    signatures = (
        (b'\xff\xd8', '.jpeg'),
        (b'\x89PNG', '.png'),
        (b'GIF8', '.gif'),
        (b'BM', '.bmp'),
    )
    for magic, extension in signatures:
        if header.startswith(magic):
            return extension
    return None


def write_file(folder, filename, filedata, flags='w'):
    """Write ``filedata`` to the file ``filename`` inside ``folder``.

    :param folder: existing directory to write into; nothing is written
        when it is not a directory
    :param filename: name of the file inside ``folder``
    :param filedata: data to write (str for text modes, bytes for binary modes)
    :param flags: mode passed to ``open`` ('w' text, 'wb' binary; use 'a'
        instead of 'w' to append)
    :return: True when the data was written, False otherwise
    """
    if not os.path.isdir(folder):
        return False
    try:
        # The context manager closes the file even when write() fails.
        with open(os.path.join(folder, filename), flags) as file_obj:
            file_obj.write(filedata)
    except IOError as e:
        print("报错了,报错信息如下:\n{}".format(e))
        return False
    return True


def get_read(path, toPath, image_path):
    """Extract the text of a PDF into a text file and try to save its images.

    Each page is laid out with pdfminer. The text of every
    ``LTTextBoxHorizontal`` is appended (UTF-8) to ``toPath``, and every
    top-level ``LTImage`` is handed to ``save_image``, which writes into the
    module-level ``images_folder``. A summary of the object counts is
    printed at the end.

    :param path: path of the PDF file to read
    :param toPath: text file that the extracted text is appended to
    :param image_path: kept for backward compatibility and no longer used.
        The previous code opened this file in "wb" mode without writing
        anything, which only left a zero-byte file behind.
    :raises PDFTextExtractionNotAllowed: when the document does not allow
        text extraction
    """
    # Open the PDF in binary mode.
    with open(path, "rb") as pdf_stream:
        # Create the parser and the document, then link them to each other.
        parser = PDFParser(pdf_stream)
        pdfFile = PDFDocument()
        parser.set_document(pdfFile)
        pdfFile.set_parser(parser)
        # Initialise the document (no password is supplied).
        pdfFile.initialize()
        # Stop when the document does not allow text extraction.
        if not pdfFile.is_extractable:
            print("不提供txt转换")
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout-aggregating device and page interpreter.
        manager = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(manager, laparams=laparams)
        interpreter = PDFPageInterpreter(manager, device)

        num_page, num_image, num_curve, num_TextBoxHorizontal = 0, 0, 0, 0

        # Process one page per iteration.
        for page in pdfFile.get_pages():
            num_page += 1
            interpreter.process_page(page)
            # Layout (LTPage) object of the page that was just processed.
            layout = device.get_result()
            # NOTE(review): only top-level layout objects are inspected here.
            # pdfminer presumably nests LTImage objects inside LTFigure
            # containers, which would explain images not being detected -
            # confirm before relying on the image counts.
            for content in layout:
                print(content)
                if isinstance(content, LTImage):
                    num_image += 1
                if isinstance(content, LTCurve):
                    num_curve += 1

                if isinstance(content, LTTextBoxHorizontal):
                    # Count the text box so the summary below is accurate.
                    num_TextBoxHorizontal += 1
                    with open(toPath, "a", encoding='utf-8') as text_file:
                        text_file.write(content.get_text())

                if isinstance(content, LTImage):
                    print("检测到图片")
                    # Best effort: a failing image must not abort the run.
                    try:
                        save_image(content, num_page, images_folder)
                        print("提取正常")
                    except Exception as e:
                        print("图片提取异常", e)
        print('对象数量:\n', '页面数:%s\n' % num_page, '图片数:%s\n' % num_image, '曲线数:%s\n' % num_curve, '水平文本框:%s\n'
              % num_TextBoxHorizontal)


if __name__ == '__main__':
    # Output locations: a single image file and the extracted-text file.
    # The image path names only one file; it needs changing when many
    # images are to be extracted.
    image_path = r"帅哥.png"
    to_path = r"读取pdf文件03.txt"
    # Sample PDF used for testing image extraction.
    path = r"Selenium 自动化爬虫.pdf"
    get_read(path, to_path, image_path)
前面的代码都经过我自己的测试,写文章的时候稍有改动,但不影响运行。微薄之力,希望能够帮助到你。

感谢阅读!