
CSV file is missing some data when written from Python

mat • 3 years ago

I'm using Scrapy to build a Python program that crawls a given domain. Whenever it finds a PDF it downloads it and scans it for information (the PDF's location, page count, image count, field count, whether it is tagged, etc.), then writes all of that to a CSV file.

It downloads all of the PDF files, but when I open the CSV only a small fraction of the downloaded files show up in it. I'm not sure what I'm doing wrong. I thought maybe I wasn't closing the file properly after opening it, but I'm not sure that's the problem. The code is below; a small standalone sketch of just the CSV-append step follows it.

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request
import urllib.parse as urlparse
import os.path
import validators
import csv
from .. info import isTagged
from .. get_metadata import get_data, count_images
from .. fieldCount import getFieldCount


class PdfspiderSpider(CrawlSpider):
    name = 'pdfspider' 
    allowed_domain = input('Enter the domain name of the website to be crawled (domain of https://google.com is "google"): ')
    allowed_domains = [allowed_domain]
    #need domain to name folder pdfs will be put into
    global domain
    domain = allowed_domains[0]
    global start
    start = input('Enter the url of the page you wish to start the crawl on (include http/https): ')
    start_urls = [start]
    global base_path
    base_path = input('Where do you wish to save the folder containing the pdfs?: ')

    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

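    # Runs for every page matched by the crawl rule: collect links that end in .pdf
    # and schedule each one for download via save_pdf.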
    def parse_item(self, response):
        base_url = start        
        for a in response.xpath('//a[@href]/@href'):
            link = a.extract()
            if link.endswith('.pdf'):
                link = urlparse.urljoin(base_url, link)
                yield Request(link, callback=self.save_pdf)

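    # Create the CSV and write the header row; called the first time a PDF is
    # saved and the CSV does not exist yet.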
    def create_csv(self):
        header = ['location', 'title', 'author', '# of pages', 'tagged?', 'field count', 'image count']
        filename = base_path + '/' +domain + '/' + domain + '.csv'
        f = open(filename, 'x')
        writer = csv.writer(f)
        writer.writerow(header)
        f.close()


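    # Download callback: write the PDF to disk, extract its metadata, and append
    # one row to the CSV.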
    def save_pdf(self, response):
        url=response.url
        if response.status == 200:
            save_dir = base_path + '/' + domain
            isExist = os.path.exists(save_dir)
            if not isExist:
                # Create a new directory because it does not exist 
                os.makedirs(save_dir)
            csvFile = domain + '.csv'
            csvPath = save_dir + '/' + csvFile
            csvPathExist = os.path.exists(csvPath)
            if not csvPathExist:
                self.create_csv()
            file = response.url.split('/')[-1]
            full_path = os.path.join(save_dir, file)
            with open(full_path, 'wb') as f:
                f.write(response.body)
                
                is_tagged = isTagged(full_path)
                metaData = get_data(full_path)
                fieldCount = getFieldCount(full_path)
                imageCount = count_images(full_path)
                row = [url, metaData[0], metaData[1], metaData[2], is_tagged, fieldCount, imageCount]
                self.add_to_csv(row)
                f.close()
            
        else:
            print(f"Failed to load pdf: {url}")

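    # Append a single row to the shared CSV file.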
    def add_to_csv(self,row):
        filename = base_path + '/' +domain + '/' + domain + '.csv'
        f = open(filename, 'a', newline='')
        writer = csv.writer(f)
        writer.writerow(row)
        f.close()
        
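In case it helps to narrow things down, here is a minimal, self-contained sketch of just the CSV-append step with the rest of the spider stripped away. The CSV_PATH value and the append_row helper are placeholders made up for the example; the point is that opening the file in a with-block guarantees it is flushed and closed even if writerow() raises, which is what the explicit f.close() calls are meant to ensure:

import csv
import os

# Placeholder path; in the spider this would be base_path + '/' + domain + '/' + domain + '.csv'
CSV_PATH = 'sample.csv'
HEADER = ['location', 'title', 'author', '# of pages', 'tagged?', 'field count', 'image count']


def append_row(row):
    # Write the header on first use, then append; the with-block closes
    # (and flushes) the file even if writerow() raises.
    write_header = not os.path.exists(CSV_PATH)
    with open(CSV_PATH, 'a', newline='') as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow(HEADER)
        writer.writerow(row)


if __name__ == '__main__':
    append_row(['https://example.com/a.pdf', 'Title A', 'Author A', 3, True, 0, 2])
    append_row(['https://example.com/b.pdf', 'Title B', 'Author B', 10, False, 4, 1])

Run on its own this writes both rows, so if rows are still missing in the spider the gap may be upstream of the CSV writing itself (for example, responses that never reach add_to_csv).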
