我打算为文件生成一个校验和,然后将该校验和作为元数据写入文件本身。然而,我遇到的问题是:写入元数据之前和之后生成的校验和并不相同,我意识到这是因为写入元数据改变了文件的内容。
下面是我在 Python 3 中使用 PyPDF2 的实现:
import os
import sys
import time
import hashlib
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.generic import NameObject, createStringObject
1. 生成文件 MD5 校验和的函数
def md5_checksum(filePath):
    """Return the hexadecimal MD5 digest of the file at *filePath*.

    The file is read in 8 KiB chunks so that large files are never loaded
    into memory in one piece.

    Args:
        filePath: Path of the file to hash.

    Returns:
        The 32-character hexadecimal digest, or the string ``'NULL'`` when
        the file cannot be opened or read (an error line is printed in
        that case).
    """
    try:
        # The with-statement closes the handle on every exit path. A
        # separate ``finally: stream.close()`` would raise
        # UnboundLocalError whenever open() itself fails, hiding the
        # 'NULL' result from the caller.
        with open(filePath, 'rb') as stream:
            digest = hashlib.md5()
            # iter() with a sentinel stops at the first empty read (EOF).
            for chunk in iter(lambda: stream.read(8192), b''):
                digest.update(chunk)
            return digest.hexdigest()
    except IOError:
        print('[ERROR][NOT FOUND]: ' + filePath)
        return 'NULL'
2. 将元数据嵌入 PDF 文件的函数
def embedd_metadata_file(file_in, file_out, metadatas):
    """Rewrite the PDF at *file_in* with additional document-info entries.

    Every page of the source PDF is copied into a new document. The
    source's existing document-info entries are carried over, then the
    entries from *metadatas* are applied (overriding same-named keys), and
    finally a ``/Pages`` entry holding the page count is set. The result is
    written to *file_out*, which then replaces *file_in*.

    Args:
        file_in: Path of the PDF to update; it is overwritten on success.
        file_out: Temporary path the new PDF is written to before it is
            moved over *file_in*.
        metadatas: Mapping of info keys (e.g. ``'/Title'``) to values;
            each value is converted with ``str()`` before being stored.

    Returns:
        The document information read back from the rewritten *file_in*.
    """
    with open(file_in, 'rb') as fin:
        pdf = PdfFileReader(fin)
        writer = PdfFileWriter()
        metadata = writer._info.getObject()
        page_count = pdf.getNumPages()

        for page in range(page_count):
            writer.addPage(pdf.getPage(page))

        # documentInfo is None for a PDF without an /Info dictionary;
        # fall back to an empty mapping so there is simply nothing to copy.
        info = pdf.documentInfo or {}
        for key in info:
            metadata.update({NameObject(key): createStringObject(info[key])})
        for key in metadatas:
            metadata.update(
                {NameObject(key): createStringObject(str(metadatas[key]))}
            )
        metadata.update({
            NameObject('/Pages'): createStringObject(str(page_count))
        })

        # Write while the source is still open: the pages added above
        # still belong to the reader backed by ``fin``.
        with open(file_out, 'wb') as fout:
            writer.write(fout)

    # Both handles are closed here. os.replace overwrites the destination
    # in one step, so file_in never goes missing between a separate
    # unlink and rename.
    os.replace(file_out, file_in)

    # Read the result back under a context manager so the handle is not
    # leaked. NOTE(review): assumes the returned info holds only direct
    # string values (as written above) and stays usable after close.
    with open(file_in, 'rb') as result:
        return PdfFileReader(result).getDocumentInfo()
3. 调用函数
# Driver: hash the PDF, then embed the hash and descriptive metadata in it.
file = 'example.pdf'
# NOTE(review): DIR is not defined in the visible code; it must be set to
# the directory containing the PDF before this point.
file_in = os.path.join(DIR, file)
file_out = os.path.join(DIR, file + '.OUT.pdf')

# The checksum is taken BEFORE the metadata is embedded, so it describes
# the original bytes; hashing the rewritten file gives a different value.
checksum = md5_checksum(file_in)
metadata = embedd_metadata_file(file_in, file_out, {
    '/MD5Checksum': checksum,
    '/ISBN': 'xxx-xxxx-xxxxxxx-xxx',
    # The closing quote was missing here, which made the script a
    # syntax error.
    '/eISBN': 'xxx-xxxx-xxxxxxx-xxx',
    '/Title': 'Blablablabla',
    '/Size': '2.3 MB',
    '/Author': 'Some Author',
    '/Copyright': 'Blabla Blablabla',
    '/Version': '2',
    '/Publisher': 'Blablablabla Blabla'
})
问题是:如何将校验和作为元数据写入文件,而不改变文件本身的校验和?非常感谢任何帮助。