# 1. config
import pymongo
import time
import datetime
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from CommentSpider.Config import *
# Module-wide MongoDB handles, created at import time. MONGO_URL and MONGO_DB
# are not defined in this file; presumably they come from the star import of
# CommentSpider.Config.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
# One PhantomJS browser session shared by every function in this module.
# SERVICE_ARGS is likewise presumably supplied by CommentSpider.Config.
browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
browser.set_window_size(1400, 900)
# Explicit-wait helper bound to the shared browser, with a 15-second timeout.
wait = WebDriverWait(browser, 15)
# 'MM-DD' labels for today and yesterday, computed once at import time;
# get_comments substitutes them for the page's relative timestamps.
now = datetime.date.today()
one_day = datetime.timedelta(days=1)
today = now.strftime('%m-%d')
yesterday = (now - one_day).strftime('%m-%d')
def save_to_mongo(result):
    """Store ``result`` in the ``MONGO_TABLE`` collection unless it already exists.

    A document counts as a duplicate when the collection already holds one
    that matches every field of ``result``.

    Args:
        result: dict describing one comment. As with any PyMongo insert, the
            dict gains an ``_id`` key once it has been written.

    Storage is best-effort: any failure is reported on stdout and swallowed
    so a single bad record does not abort the crawl.
    """
    try:
        collection = db[MONGO_TABLE]
        # find_one/insert_one replace Cursor.count() and Collection.insert(),
        # both of which were removed in PyMongo 4.
        if collection.find_one(result) is None:
            collection.insert_one(result)
        else:
            print('comment duplicate')
    except Exception as exc:
        # Include the cause so a broken connection or API misuse is visible.
        print('mongodb error', exc)
def _normalize_time(c_time):
    """Replace the page's relative timestamps with 'MM-DD' labels.

    Text containing '昨日' becomes ``yesterday``, text containing '前'
    (presumably the "... ago" form) becomes ``today``; anything else is
    returned unchanged.
    """
    if '昨日' in c_time:
        return yesterday
    if '前' in c_time:
        return today
    return c_time


def _build_comment(item):
    """Collect the fields stored for one comment from its PyQuery node."""
    author = item.find('.csPpFeed_master')
    return {
        'time': _normalize_time(item.find('.csPpFeed_time').text()),
        'name': author.text(),
        'img_url': item.find('.csPpFeed_section > .section_hd > a > img').attr('data-lazy'),
        'user_url': author.attr('href'),
        'ft_love': item.find('.csPpFeed_ftLove > .ftNums').attr('data-paopao-agreecnt'),
        'ft_comment': item.find('.csPpFeed_ftComment > .ftNums').attr('data-paopao-commentcnt'),
        'comment': item.find('.csPpFeed_des > span').text(),
    }


def get_comments(page_num):
    """Scrape comment pages ``page_num`` through ``CIRCLE`` and save them.

    For each page the current browser DOM is parsed, every comment node found
    in that page's index range is passed to ``save_to_mongo``, and the
    "load more" link is clicked to pull in the next page.

    Args:
        page_num: zero-based index of the first page to scrape. Nothing
            happens when it is already greater than ``CIRCLE``.

    Raises:
        selenium.common.exceptions.TimeoutException: if the "load more" link
            does not become clickable, or does not show its label again,
            within the shared wait timeout.
    """
    item_selector = ('#qitancommonarea > div:nth-child(2) > div > div > div > '
                     'div:nth-child(2) > div > div:nth-child({})')
    more_locator = (
        By.CSS_SELECTOR,
        '#qitancommonarea > div:nth-child(2) > div > div > div > div:nth-child(2)'
        '> div > div.csPpFeed_list > div > div > a',
    )

    # Iterate instead of recursing once per page, so a large CIRCLE cannot
    # run into Python's recursion limit.
    while page_num <= CIRCLE:
        print('page_num:', page_num)
        doc = pq(browser.page_source)

        # NOTE(review): this reads 19 children per page while the offset
        # advances by 20, so child 24 + 20 * page_num is never visited.
        # Confirm whether that child is a comment before widening the range.
        for i in range(5 + page_num * 20, 24 + page_num * 20):
            item = doc.find(item_selector.format(i))
            # PyQuery returns an empty selection rather than raising when
            # nothing matches; skip it instead of storing an empty record.
            if not item:
                continue
            save_to_mongo(result=_build_comment(item))

        # The last requested page is done: loading another one would only
        # cost time and could raise a spurious timeout.
        if page_num + 1 > CIRCLE:
            break

        more = wait.until(EC.element_to_be_clickable(more_locator))
        more.click()
        sleep()
        page_num += 1
        # The link shows '加载更多' again once the next batch has been added.
        wait.until(EC.text_to_be_present_in_element(more_locator, '加载更多'))
def start(url):
    """Open ``url`` in the shared browser and scrape its comments from page 0.

    The page is scrolled to the bottom (presumably to trigger loading of the
    comment area), then the function blocks until the 24th child of the
    comment container is present before handing over to ``get_comments``.

    Args:
        url: address of the video page to scrape.
    """
    print('url:', url)
    browser.get(url)
    browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    sleep()

    last_item_index = 24
    last_item_css = (
        '#qitancommonarea > div:nth-child(2) > div > div > '
        'div > div:nth-child(2) > div > div:nth-child({})'
    ).format(last_item_index)
    first_page_present = EC.presence_of_element_located(
        (By.CSS_SELECTOR, last_item_css)
    )
    wait.until(first_page_present)

    get_comments(page_num=0)
def sleep(seconds=3):
    """Pause the crawler to give the page time to settle.

    Args:
        seconds: how long to block, in seconds. Defaults to 3, the pause
            used by all existing call sites.
    """
    time.sleep(seconds)
def stop_vedio():
    """Click the player's play control as soon as it is clickable.

    Judging by the function name this is meant to stop video playback;
    the code itself only performs a single click on the control's icon.
    """
    play_icon_css = ('#flashbox > div.pw-video > div:nth-child(5) > '
                     'div.bottom-public > a.bottom-public_play > i')
    icon_clickable = EC.element_to_be_clickable((By.CSS_SELECTOR, play_icon_css))
    wait.until(icon_clickable).click()
if __name__ == '__main__':
    # Release the browser process and the MongoDB connection even when the
    # crawl fails part-way; neither was ever closed before.
    try:
        start(url='http://www.iqiyi.com/v_19rr6z41d8.html')
    finally:
        browser.quit()
        client.close()