import os
import random
import time

import pandas as pd
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
# Start the browser service.
# Build PhantomJS capabilities and spoof the User-Agent so the target site
# sees an ordinary desktop Firefox instead of the default PhantomJS UA.
dcap = dict(DesiredCapabilities.PHANTOMJS)
dcap["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:25.0) Gecko/20100101 Firefox/25.0"
)
# Launch the PhantomJS driver (selenium starts the local service by itself).
driver = webdriver.PhantomJS(desired_capabilities=dcap)
# Build the scraping function.
def getlaogou(driver, url):
    """Scrape Lagou job listings page by page into a pandas DataFrame.

    Navigates the driver to *url*, then repeatedly harvests the job
    attributes embedded in the listing markup and clicks "next page"
    until the pagination widget reports the last page (page '30').

    Parameters
    ----------
    driver : selenium webdriver
        An already-started driver instance; it is quit before returning.
    url : str
        The listing-page URL to start scraping from.

    Returns
    -------
    pandas.DataFrame
        One row per job posting, columns matching the keys below.
    """
    # Accumulators for every field collected across all pages.
    myresult = {
        "position_name": [],
        "position_company": [],
        "position_salary": [],
        "position_link": [],
        "position_exprience": [],
        "position_industry": [],
        "position_environment": [],
    }
    # Navigate to the target URL.
    driver.get(url)
    # Page counter, used only for progress reporting.
    page = 0
    while True:
        page += 1
        # Parse a snapshot of the current page DOM.
        root = etree.HTML(driver.page_source)
        # Extend each column with this page's matches. The job attributes
        # live in data-* attributes on the <li> elements of the result list.
        rows = '//ul[@class="item_con_list"]/li'
        myresult["position_name"].extend(root.xpath(rows + '/@data-positionname'))
        myresult["position_company"].extend(root.xpath(rows + '/@data-company'))
        myresult["position_salary"].extend(root.xpath(rows + '/@data-salary'))
        myresult["position_link"].extend(root.xpath('//div[@class="p_top"]/a/@href'))
        myresult["position_exprience"].extend(
            node.xpath('string(.)').strip()
            for node in root.xpath('//div[@class="p_bot"]/div[@class="li_b_l"]')
        )
        myresult["position_industry"].extend(
            text.strip() for text in root.xpath('//div[@class="industry"]/text()')
        )
        myresult["position_environment"].extend(
            root.xpath('//div[@class="li_b_r"]/text()')
        )
        # Polite random delay (0-2 s) between page loads.
        time.sleep(random.choice(range(3)))
        # The first <span> of the page-number widget holds the current page
        # label; '30' marks the final page. Guard against a missing widget
        # (empty xpath result) instead of crashing with IndexError.
        marker = root.xpath('//div[@class="page-number"]/span[1]/text()')
        if marker and marker[0] != '30':
            # Not at the last page yet: click "next page".
            driver.find_element_by_xpath(
                '//div[@class="pager_container"]/a[last()]'
            ).click()
            # Report progress for this page.
            print("第【{}】页抓取成功!".format(page))
        else:
            # Reached the last page (or pagination vanished): stop.
            break
    # Report overall completion.
    print("everything is OK")
    # Shut down the selenium service.
    driver.quit()
    # Package the accumulated columns as a DataFrame.
    return pd.DataFrame(myresult)