import json
import math
import os
import random
import sys
import time
from tkinter.filedialog import askdirectory

import numpy as np
import requests
from lxml import etree
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver import ActionChains, ChromeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from tqdm import trange
def get_choucang_item(self, page=3):
    """Scrape the favorited-items list and save it as JSON.

    Requests up to ``page`` chunks of the favorites list through
    ``self.driver``. Chunks are addressed with the ``startRow`` query
    parameter in steps of 30 (0, 30, 60, ...). For every ``<li>`` element
    in a chunk the title, link and price are extracted, and all records are
    written to ``shoucang_item.json`` inside ``self.path``.

    Args:
        page: Maximum number of chunks to request. Defaults to 3.

    Raises:
        Exception: If the page text contains '登录' ("log in"), which
            presumably means the browser session is not authenticated.
    """
    url = 'https://shoucang.taobao.com/nodejs/item_collect_chunk.htm?ifAllTag=0&tab=0&tagId=&categoryCount=0&type=0&tagName=&categoryName=&needNav=false&startRow={}'
    rows_per_chunk = 30

    def joined(node, path):
        # Concatenate every stripped string the XPath yields for this node.
        return ''.join(part.strip() for part in node.xpath(path))

    records = []
    for chunk_no in trange(page):
        self.driver.get(url.format(chunk_no * rows_per_chunk))
        html_str = self.driver.page_source
        if html_str == '':
            break
        if '登录' in html_str:
            raise Exception('登录')
        entries = etree.HTML(html_str).xpath('//li')
        if not entries:
            # A browser typically serialises even an empty response into a
            # non-empty HTML skeleton, so the '' check above may never fire.
            # A chunk without entries is treated as the end of the list.
            break
        for entry in entries:
            records.append({
                'title': joined(entry, './div[@class="img-item-title"]//text()'),
                'url': joined(entry, './div[@class="img-item-title"]/a/@href'),
                # An entry without a price is recorded as '失效' ("invalid").
                'price': joined(entry, './div[@class="price-container"]//text()') or '失效',
            })

    out_path = os.path.join(self.path, 'shoucang_item.json')
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(records))
def get_footmark_item(self, page=3, load_timeout=5):
    """Scrape the browsing-history (footmark) page and save it as JSON.

    Opens the footmark page with ``self.driver`` and makes up to ``page``
    passes over it. Each pass parses the item ``<div>`` elements that were
    not handled by an earlier pass, then scrolls to the bottom of the page.
    Date, link, name and price of every item are written to
    ``footmark_item.json`` inside ``self.path``.

    Args:
        page: Maximum number of passes (scroll batches) to read.
            Defaults to 3.
        load_timeout: Seconds to wait after a scroll for additional items
            to appear before giving up. Defaults to 5.
    """
    url = 'https://www.taobao.com/markets/footmark/tbfoot'
    item_xpath = '//div[@class="item-list J_redsList"]/div'

    def joined(node, path):
        # Concatenate every stripped string the XPath yields for this node.
        return ''.join(part.strip() for part in node.xpath(path))

    self.driver.get(url)
    seen = 0  # number of item nodes already converted into records
    records = []
    for pass_no in trange(page):
        # Only look at nodes that appeared since the previous pass.
        fresh = etree.HTML(self.driver.page_source).xpath(item_xpath)[seen:]
        for node in fresh:
            records.append({
                'date': joined(node, './@data-date'),
                'url': joined(node, './a/@href'),
                'name': joined(node, './/div[@class="title"]//text()'),
                'price': joined(node, './/div[@class="price-box"]//text()'),
            })
        seen += len(fresh)

        self.driver.execute_script('window.scrollTo(0,1000000)')
        if pass_no + 1 == page:
            # Last pass: nothing is read afterwards, so there is no need
            # to wait for more items.
            break
        # Presumably the page appends items asynchronously after a scroll;
        # reading page_source immediately would then miss them. Wait until
        # the item count grows, and stop if nothing new arrives in time.
        try:
            WebDriverWait(self.driver, load_timeout).until(
                lambda drv: len(drv.find_elements(By.XPATH, item_xpath)) > seen
            )
        except TimeoutException:
            break

    out_path = os.path.join(self.path, 'footmark_item.json')
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(records))