1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
| #!/usr/bin/python3 # -*- coding: utf-8 -*- # @Time : 2019/3/19 11:21 # @Author : qizai # @File : fetch_pdf_v2.py # @Software: PyCharm
from pdfminer.pdfparser import PDFParser, PDFDocument from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, PDFTextExtractionNotAllowed from pdfminer.converter import PDFPageAggregator from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTImage, LTCurve, LTFigure
# 如果要提取图片,那么需要导入一下几个库 import sys import os from binascii import b2a_hex
images_folder = r'image'  # Directory in which extracted images are stored
def save_image(lt_image, page_number, images_folder):
    """Save the raw image data of an LTImage-like object into a folder.

    The image type is detected by ``determine_image_type`` from the first
    four bytes of the raw stream data and the bytes are written with
    ``write_file`` in binary mode. The file name is built as
    ``<page_number>_<lt_image.name><extension>``.

    :param lt_image: object exposing ``stream`` (with ``get_rawdata()``)
        and ``name``
    :param page_number: page number, used as the file-name prefix
    :param images_folder: directory the image file is written into
    :return: the file name on success, otherwise ``None``
    """
    stream = lt_image.stream
    if not stream:
        return None

    # Fetch the raw data once and reuse it for both type detection and
    # writing.
    raw_data = stream.get_rawdata()
    if not raw_data:
        # No raw bytes available (None or empty): nothing can be detected
        # or saved, and slicing None would raise TypeError.
        return None

    file_ext = determine_image_type(raw_data[0:4])
    if not file_ext:
        return None

    file_name = ''.join([str(page_number), '_', lt_image.name, file_ext])
    if write_file(images_folder, file_name, raw_data, flags='wb'):
        return file_name
    return None
def determine_image_type(stream_first_4_bytes):
    """Return the image file extension matching a magic-number prefix.

    The leading bytes of an image stream are compared against the
    signatures of JPEG (first 2 bytes), PNG (4 bytes), GIF (4 bytes) and
    BMP (first 2 bytes).

    :param stream_first_4_bytes: bytes-like object holding the first bytes
        of the stream (only the first four are inspected)
    :return: ``'.jpeg'``, ``'.png'``, ``'.gif'`` or ``'.bmp'``; ``None``
        when the input is empty or no signature matches
    """
    if not stream_first_4_bytes:
        return None

    # Compare raw bytes directly: a hex dump produced by b2a_hex is a
    # bytes object in Python 3 and never matches str literals.
    header = bytes(stream_first_4_bytes[:4])
    signatures = (
        (b'\xff\xd8', '.jpeg'),
        (b'\x89PNG', '.png'),
        (b'GIF8', '.gif'),
        (b'BM', '.bmp'),
    )
    for magic, extension in signatures:
        if header.startswith(magic):
            return extension
    return None
def write_file(folder, filename, filedata, flags='w'):
    """Write data to the file ``filename`` inside the existing ``folder``.

    :param folder: target directory; nothing is written if it is not an
        existing directory
    :param filename: name of the file inside ``folder``
    :param filedata: data passed to ``write()`` (text for text modes,
        bytes for binary modes)
    :param flags: mode passed to ``open()``: ``'w'`` for text, ``'wb'``
        for binary, ``'a'`` instead of ``'w'`` to append
    :return: ``True`` if the data was written, ``False`` if the folder
        does not exist or an ``IOError`` occurred (the error is printed)
    """
    if not os.path.isdir(folder):
        return False

    try:
        # The context manager closes the handle even when write() fails.
        with open(os.path.join(folder, filename), flags) as file_obj:
            file_obj.write(filedata)
    except IOError as e:
        print("报错了,报错信息如下:\n{}".format(e))
        return False
    return True
def get_read(path, toPath, image_path):
    """Extract text and images from a PDF file.

    Every object of each page layout is printed. The text of horizontal
    text boxes is appended (UTF-8) to ``toPath``, and every ``LTImage``
    found directly in a page layout is handed to ``save_image`` with the
    module-level ``images_folder`` as destination. A summary of the object
    counts is printed once all pages are processed.

    :param path: path of the PDF file to read
    :param toPath: path of the text file the extracted text is appended to
    :param image_path: unused; retained for backward compatibility.
        Previously this path was opened for writing for each image but
        nothing was ever written, leaving an empty file behind.
    :raises PDFTextExtractionNotAllowed: if the document does not allow
        text extraction
    """
    # Open the PDF in binary mode; the parser reads from this handle.
    with open(path, "rb") as pdf_stream:
        # Create the parser and the document, then link them both ways.
        parser = PDFParser(pdf_stream)
        pdfFile = PDFDocument()
        parser.set_document(pdfFile)
        pdfFile.set_parser(parser)
        # Initialise the document (no password supplied).
        pdfFile.initialize()

        # Stop early when the document forbids text extraction.
        if not pdfFile.is_extractable:
            print("不提供txt转换")
            raise PDFTextExtractionNotAllowed

        # Resource manager, layout-aggregating device and interpreter.
        manager = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(manager, laparams=laparams)
        interpreter = PDFPageInterpreter(manager, device)

        num_page, num_image, num_curve, num_figure, num_TextBoxHorizontal = 0, 0, 0, 0, 0

        # Process one page at a time.
        for page in pdfFile.get_pages():
            num_page += 1
            interpreter.process_page(page)
            # Layout (LTPage) produced for the page just processed.
            layout = device.get_result()

            page_text = []
            # NOTE(review): only objects at the top level of the page
            # layout are inspected; anything nested inside a container
            # such as LTFigure is not visited. Confirm this is intended.
            for content in layout:
                print(content)
                if isinstance(content, LTCurve):
                    num_curve += 1
                if isinstance(content, LTFigure):
                    # Tallied but not part of the printed summary.
                    num_figure += 1

                if isinstance(content, LTTextBoxHorizontal):
                    num_TextBoxHorizontal += 1
                    page_text.append(content.get_text())

                if isinstance(content, LTImage):
                    num_image += 1
                    print("检测到图片")
                    # Best effort: a failing image must not abort the run.
                    try:
                        saved_name = save_image(content, num_page, images_folder)
                    except Exception as e:
                        print("图片提取异常", e)
                    else:
                        if saved_name:
                            print("提取正常")
                        else:
                            # save_image signals failure by returning None.
                            print("图片提取异常", "图片未保存:无法识别图片类型或写入失败")

            # Append the page's text in one go instead of reopening the
            # output file for every text box.
            if page_text:
                with open(toPath, "a", encoding='utf-8') as text_out:
                    text_out.write(''.join(page_text))

        print('对象数量:\n', '页面数:%s\n' % num_page, '图片数:%s\n' % num_image, '曲线数:%s\n' % num_curve, '水平文本框:%s\n' % num_TextBoxHorizontal)
if __name__ == '__main__':
    # Script entry point: run the extraction on the sample PDF.
    get_read(
        # PDF used to test image extraction.
        path=r"Selenium 自动化爬虫.pdf",
        # Text file receiving the extracted text.
        toPath=r"读取pdf文件03.txt",
        # Only a single image target is given here; this needs to change
        # if many images are to be extracted.
        image_path=r"帅哥.png",
    )
|