I'm building a Python program with Scrapy that crawls a given domain and, whenever it finds a PDF, scans it for information (the PDF's location, page count, image count, field count, whether it is tagged, etc.) and writes all of that to a CSV file.
It downloads all of the PDFs just fine, but when I open the CSV, only a small fraction of the downloaded files appear in it. I'm not sure what I'm doing wrong. I thought maybe I wasn't closing the file properly after opening it, but I'm not sure that's the issue. Here is the code:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request
import urllib.parse as urlparse
import os
import validators
import csv

from ..info import isTagged
from ..get_metadata import get_data, count_images
from ..fieldCount import getFieldCount
class PdfspiderSpider(CrawlSpider):
    name = 'pdfspider'
    allowed_domain = input('Enter the domain name of the website to be crawled (domain of https://google.com is "google"): ')
    allowed_domains = [allowed_domain]
    # The domain is needed to name the folder the pdfs will be put into.
    global domain
    domain = allowed_domains[0]
    global start
    start = input('Enter the url of the page you wish to start the crawl on (include http/https): ')
    start_urls = [start]
    global base_path
    base_path = input('Where do you wish to save the folder containing the pdfs?: ')

    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )
    def parse_item(self, response):
        # Resolve relative hrefs against the page they were found on;
        # using the start url as the base would break links on deeper pages.
        for a in response.xpath('//a[@href]/@href'):
            link = a.extract()
            if link.endswith('.pdf'):
                link = urlparse.urljoin(response.url, link)
                yield Request(link, callback=self.save_pdf)
    def create_csv(self):
        header = ['location', 'title', 'author', '# of pages', 'tagged?',
                  'field count', 'image count']
        filename = base_path + '/' + domain + '/' + domain + '.csv'
        # 'x' mode raises if the file already exists; save_pdf only calls
        # this when the csv is missing.
        with open(filename, 'x', newline='') as f:
            csv.writer(f).writerow(header)
    def save_pdf(self, response):
        url = response.url
        if response.status == 200:
            save_dir = base_path + '/' + domain
            if not os.path.exists(save_dir):
                # Create the output directory because it does not exist yet.
                os.makedirs(save_dir)
            csvPath = save_dir + '/' + domain + '.csv'
            if not os.path.exists(csvPath):
                self.create_csv()
            file = response.url.split('/')[-1]
            full_path = os.path.join(save_dir, file)
            # Write the pdf to disk; the with-block closes it, so it is
            # fully flushed before the helpers below read it back.
            with open(full_path, 'wb') as f:
                f.write(response.body)
            is_tagged = isTagged(full_path)
            metaData = get_data(full_path)
            fieldCount = getFieldCount(full_path)
            imageCount = count_images(full_path)
            row = [url, metaData[0], metaData[1], metaData[2], is_tagged,
                   fieldCount, imageCount]
            self.add_to_csv(row)
        else:
            print(f"Failed to load pdf: {url}")
    def add_to_csv(self, row):
        filename = base_path + '/' + domain + '/' + domain + '.csv'
        # Open in append mode, write one row, and let the with-block close it.
        with open(filename, 'a', newline='') as f:
            csv.writer(f).writerow(row)
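
One thing I wondered about: instead of reopening the CSV for every row, would it be safer to keep a single writer open for the whole crawl and close it when the spider finishes? Below is a rough, untested sketch of what I mean, using Scrapy's closed() hook (which Scrapy calls when the spider ends); _csv_file and _writer are just names I made up, and save_pdf would call self._writer.writerow(row) instead of self.add_to_csv(row).

class PdfspiderSpider(CrawlSpider):
    # ... name / allowed_domains / start_urls / rules as above ...

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        save_dir = os.path.join(base_path, domain)
        os.makedirs(save_dir, exist_ok=True)
        # One file handle and one writer for the lifetime of the spider.
        self._csv_file = open(os.path.join(save_dir, domain + '.csv'),
                              'w', newline='')
        self._writer = csv.writer(self._csv_file)
        self._writer.writerow(['location', 'title', 'author', '# of pages',
                               'tagged?', 'field count', 'image count'])

    def closed(self, reason):
        # Scrapy calls closed() automatically when the crawl finishes.
        self._csv_file.close()

Is that the right direction, or is the open/append/close-per-row pattern above fine and my problem elsewhere?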