# Scan every page for the keyword and queue the matching pages for output.
with pdfplumber.open(path + r'\公司年报.PDF') as pdf:
    for i in range(pdf_reader.getNumPages()):
        page = pdf.pages[i]
        # Extract the text once per page instead of three times, and fall
        # back to '' because some pdfplumber releases return None for a page
        # without extractable text, which would make the `in` test below
        # raise TypeError.
        text = page.extract_text() or ''
        print(text)
        if '战略' in text:
            # Copy the same page (same 0-based index) from the PyPDF2 reader.
            pdf_writer.addPage(pdf_reader.getPage(i))
            print(i + 1, text)
完成识别后,让写入器将筛选出的页面输出到指定文件名的新文件中:
# Persist the pages collected in pdf_writer as a new PDF file.
with open(path + r'\new_公司年报.pdf', mode='wb') as out_file:
    pdf_writer.write(out_file)
至此,我们就完成了包含特定文字内容页面的提取,并整合成一个PDF。所有的页面均包含“战略”二字:
需求一的完整代码如下,感兴趣的读者可以自行研究:
from PyPDF2 import PdfFileReader, PdfFileWriter
import pdfplumber

# Directory that holds the source PDF (placeholder value; adjust as needed).
path = r'C:\xxx'

# The reader supplies the pages to copy; the writer collects the matches.
# Both must exist before the loop below, otherwise the listing fails with
# NameError when run on its own.
pdf_reader = PdfFileReader(path + r'\公司年报.PDF')
pdf_writer = PdfFileWriter()

# Scan every page for the keyword and queue the matching pages for output.
with pdfplumber.open(path + r'\公司年报.PDF') as pdf:
    for i in range(pdf_reader.getNumPages()):
        page = pdf.pages[i]
        # Extract the text once per page, and fall back to '' because some
        # pdfplumber releases return None for a page without extractable
        # text, which would make the `in` test below raise TypeError.
        text = page.extract_text() or ''
        print(text)
        if '战略' in text:
            pdf_writer.addPage(pdf_reader.getPage(i))
            print(i + 1, text)

# Write all matching pages into a single new PDF.
with open(path + r'\new_公司年报1.pdf', 'wb') as out:
    pdf_writer.write(out)
3.2 需求二的实现
接下来完成需求二的任务。首先导入需要的库:
# Standard library
import os
import re

# Third-party PDF libraries (fitz is the import name of PyMuPDF)
import fitz
from PyPDF2 import PdfFileReader, PdfFileWriter
# Standard library
import os
import re

# Third-party PDF libraries (fitz is the import name of PyMuPDF)
import fitz
from PyPDF2 import PdfFileReader, PdfFileWriter

# Directory that holds the source PDF and receives the output files
# (placeholder value; adjust as needed).
path = r'C:\xxx'
# Collect the 0-based indices of all pages that reference at least one image.
# The values in page_lst are later passed to PyPDF2's getPage(), so they must
# be page indices rather than PDF xref (object) numbers: xref numbers are
# unrelated to page order and can exceed the page count.
page_lst = []
with fitz.open(path + r'\公司年报.PDF') as pdf:
    for page_no in range(len(pdf)):
        # get_page_images() returns one entry per image referenced by the
        # page; an empty list means the page has no images.
        # NOTE(review): on PyMuPDF releases that predate the snake_case
        # rename this method is called getPageImageList -- confirm against
        # the installed version.
        if pdf.get_page_images(page_no):
            page_lst.append(page_no)
print(page_lst)
# Save every page listed in page_lst as its own single-page PDF.
pdf_reader = PdfFileReader(path + r'\公司年报.PDF')
for page_index in page_lst:
    # A fresh writer per iteration keeps each output file to exactly one page.
    pdf_writer = PdfFileWriter()
    selected_page = pdf_reader.getPage(page_index)
    pdf_writer.addPage(selected_page)
    # The file name carries the 1-based page number.
    target_file = path + r'\公司年报_{}.pdf'.format(page_index + 1)
    with open(target_file, 'wb') as out:
        pdf_writer.write(out)