0%

从 PDF 中提取图片：有两个库可以使用——fitz（需要 install pymupdf）和 pdfminer（需要 install pdfminer3k）

发表于 2019-08-06 更新于 2019-11-30 分类于 Python

前言：下面的代码都带有注释。这些代码参考了别人的实现，但因为时间隔得太久，已经忘记出处了；如果哪位小伙伴看到了并知道来源，欢迎提供一下链接。

方法一：fitz（需要 install pymupdf）。这个方法经过测试，可以成功提取图片

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time    : 2019/3/19 08:51
# @Author  : qizai
# @File    : fetch_pdf_v1.py
# @Software: PyCharm

import fitz  # pip3 install pymupdf
import time
import re
import os


def _save_png(pix, target):
    """Write ``pix`` to ``target`` using whichever PyMuPDF save method exists."""
    # Newer PyMuPDF releases expose Pixmap.save(); older ones only writePNG().
    writer = getattr(pix, "save", None) or pix.writePNG
    writer(target)


def get_image(path, pic_path):
    """Extract the images embedded in a PDF and save each one as a PNG.

    Every cross-reference (xref) object of the document is inspected; an
    object counts as an image when its definition contains both
    ``/Type /XObject`` and ``/Subtype /Image``. Each image is written to
    ``pic_path`` as ``<sanitised path>_img<N>.png``.

    :param path: path of the PDF file
    :param pic_path: directory in which the PNG files are saved
    :return: None; progress and a summary are printed
    """
    # time.clock() was removed in Python 3.8; perf_counter() replaces it.
    t0 = time.perf_counter()
    # Patterns that identify an image XObject inside an object definition.
    check_xobject = re.compile(r"/Type(?= */XObject)")
    check_image = re.compile(r"/Subtype(?= */Image)")

    doc = fitz.open(path)
    imgcount = 0
    try:
        # Prefer the current PyMuPDF names and fall back to the legacy
        # private ones so the function runs on old and new releases.
        xref_length = getattr(doc, "xref_length", None) or doc._getXrefLength
        xref_object = getattr(doc, "xref_object", None) or doc._getXrefString
        lenXREF = xref_length()

        print("文件名:{}, 页数: {}, 对象: {}".format(path, len(doc), lenXREF - 1))

        # Path separators and drive colons must not reach os.path.join(),
        # otherwise the output could land outside pic_path.
        name_prefix = path.replace('\\', '_').replace('/', '_').replace(':', '')

        # xref numbering starts at 1.
        for xref in range(1, lenXREF):
            text = xref_object(xref)
            if not check_xobject.search(text) or not check_image.search(text):
                print("不是图片")
                continue
            imgcount += 1
            target = os.path.join(
                pic_path, name_prefix + "_img{}.png".format(imgcount))

            pix = fitz.Pixmap(doc, xref)
            if pix.n < 5:
                # Fewer than 5 components: can be written as PNG directly.
                _save_png(pix, target)
            else:
                # Otherwise convert to RGB first (CMYK in the original notes).
                rgb_pix = fitz.Pixmap(fitz.csRGB, pix)
                _save_png(rgb_pix, target)
                rgb_pix = None
            # Drop the reference so the pixmap memory can be released.
            pix = None
    finally:
        doc.close()

    t1 = time.perf_counter()
    print("运行时间:{}s".format(t1 - t0))
    print("提取了{}张图片".format(imgcount))


# 运行
if __name__ == '__main__':
    # PDF to read and the folder that will receive the extracted images.
    path = r"Selenium 自动化爬虫.pdf"
    pic_path = r"image"
    # The output folder must not exist yet: create it, or stop if it does.
    if not os.path.exists(pic_path):
        os.mkdir(pic_path)
    else:
        print("文件夹已存在，请重新创建新文件夹！")
        raise SystemExit
    get_image(path, pic_path)

方法二：pdfminer（需要 install pdfminer3k）。这个库提取文字还可以，但提取图片时暂时识别不了

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Time    : 2019/3/19 11:21
# @Author  : qizai
# @File    : fetch_pdf_v2.py
# @Software: PyCharm

from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, PDFTextExtractionNotAllowed
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTImage, LTCurve, LTFigure

# 如果要提取图片，那么需要导入一下几个库
import sys
import os
from binascii import b2a_hex

images_folder = r'image'  # directory where extracted images are stored


def save_image(lt_image, page_number, images_folder):
    """Try to save the image data of an LTImage object.

    Relies on ``determine_image_type`` to pick the file extension and on
    ``write_file`` to store the raw stream data. The file is named
    ``<page_number>_<image name><extension>``.

    :param lt_image: object exposing ``stream`` and ``name`` attributes
    :param page_number: page number used as the file-name prefix
    :param images_folder: directory that receives the image file
    :return: the file name on success, otherwise None
    """
    stream = lt_image.stream
    if not stream:
        return None

    raw_data = stream.get_rawdata()
    if not raw_data:
        return None

    # Without a recognised magic number there is no extension to save under.
    file_ext = determine_image_type(raw_data[0:4])
    if not file_ext:
        return None

    file_name = ''.join([str(page_number), '_', lt_image.name, file_ext])
    if write_file(images_folder, file_name, raw_data, flags='wb'):
        return file_name
    return None


def determine_image_type(stream_first_4_bytes):
    """Find the image file type from the magic number of the first bytes.

    :param stream_first_4_bytes: the first 4 (or fewer) bytes of the data
    :return: '.jpeg', '.png', '.gif' or '.bmp', or None when the magic
        number is not recognised
    """
    # b2a_hex() returns bytes on Python 3; decode it so the comparisons
    # against the text literals below can actually match.
    bytes_as_hex = b2a_hex(stream_first_4_bytes).decode('ascii')
    if bytes_as_hex.startswith('ffd8'):
        return '.jpeg'
    if bytes_as_hex == '89504e47':
        return '.png'
    if bytes_as_hex == '47494638':
        return '.gif'
    if bytes_as_hex.startswith('424d'):
        return '.bmp'
    return None


def write_file(folder, filename, filedata, flags='w'):
    """Write the file data to the folder and filename combination.

    :param folder: existing directory to write into
    :param filename: name of the file inside ``folder``
    :param filedata: data passed to the file object's ``write``
    :param flags: mode passed to ``open`` ('w' for text, 'wb' for binary,
        'a' instead of 'w' to append)
    :return: True when the data was written; False when ``folder`` is not
        a directory or an IOError occurred (the error is printed)
    """
    if not os.path.isdir(folder):
        return False
    try:
        # The with-statement closes the file even when write() fails.
        with open(os.path.join(folder, filename), flags) as file_obj:
            file_obj.write(filedata)
    except IOError as e:
        print("报错了,报错信息如下:\n{}".format(e))
        return False
    return True


def get_read(path, toPath, image_path):
    """Read a PDF with pdfminer, saving its text and any images found.

    The text of every horizontal text box is appended to ``toPath``. For
    every ``LTImage`` found directly in a page layout, the raw stream data
    is written to ``image_path`` and ``save_image`` is asked to store it in
    the module-level ``images_folder``. Running object counts are printed
    after each page.

    :param path: path of the PDF file
    :param toPath: text file that the extracted text is appended to
    :param image_path: file that receives the raw data of the latest image
    :raises PDFTextExtractionNotAllowed: when the document is not
        extractable
    """
    # The PDF must stay open for the whole run: all parsing happens inside
    # this with-block rather than after the file has been closed.
    with open(path, "rb") as pdf_stream:
        # Create the parser and the document, then link them together.
        parser = PDFParser(pdf_stream)
        pdfFile = PDFDocument()
        parser.set_document(pdfFile)
        pdfFile.set_parser(parser)
        # Initialise the document (no password supplied).
        pdfFile.initialize()

        if not pdfFile.is_extractable:
            print("不提供txt转换")
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout-aggregating device and interpreter.
        manager = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(manager, laparams=laparams)
        interpreter = PDFPageInterpreter(manager, device)

        num_page, num_image, num_curve, num_figure, num_TextBoxHorizontal = 0, 0, 0, 0, 0

        # Process one page at a time.
        for page in pdfFile.get_pages():
            num_page += 1
            interpreter.process_page(page)
            # Layout (LTPage) object for the page just processed.
            layout = device.get_result()
            for content in layout:
                print(content)
                if isinstance(content, LTCurve):
                    num_curve += 1
                if isinstance(content, LTFigure):
                    # NOTE(review): images may be nested inside LTFigure
                    # containers instead of appearing at the top level of
                    # the layout - confirm, and descend into figures if so.
                    num_figure += 1

                if isinstance(content, LTTextBoxHorizontal):
                    num_TextBoxHorizontal += 1
                    with open(toPath, "a", encoding='utf-8') as text_file:
                        text_file.write(content.get_text())

                if isinstance(content, LTImage):
                    num_image += 1
                    print("检测到图片")
                    # NOTE(review): the original opened image_path for
                    # writing but never wrote to it, leaving an empty file.
                    # Writing the raw stream data is assumed to be the
                    # intent - confirm.
                    with open(image_path, "wb") as image_file:
                        if content.stream:
                            raw_data = content.stream.get_rawdata()
                            if raw_data:
                                image_file.write(raw_data)
                    # Only real images are handed to save_image; a failure
                    # is reported and must not stop the extraction.
                    try:
                        if save_image(content, num_page, images_folder):
                            print("提取正常")
                    except Exception as e:
                        print("图片提取异常", e)
            print('对象数量：\n', '页面数：%s\n' % num_page, '图片数：%s\n' % num_image, '曲线数：%s\n' % num_curve, '水平文本框：%s\n'
                  % num_TextBoxHorizontal)


if __name__ == '__main__':
    # PDF used to test the extraction.
    path = r"Selenium 自动化爬虫.pdf"
    # Text file that receives the extracted text.
    to_path = r"读取pdf文件03.txt"
    # Only a single image is saved here; change this when extracting many.
    image_path = r"帅哥.png"
    get_read(path, to_path, image_path)

前面的代码都经过我自己测试，写文章的时候稍有改动，但不影响运行。略尽微薄之力，希望能够帮助到你。

感谢阅读！