import requests

url = "https://item.jd.com/497227.html"  # air purifier
r = requests.get(url, timeout=30)
r.raise_for_status()  # throw HTTPError if the status code is not 200
r.encoding = r.apparent_encoding  # handle encoding issues
print(r.text[:1000])
Example: scraping a product page from Amazon China
Use the Requests library to scrape the product page at this link:
"https://www.amazon.cn/dp/B005GNM3SS/"
import requests

url = "https://www.amazon.cn/dp/B005GNM3SS/"  # air purifier
# url = "https://www.amazon.cn/gp/product/B01ARKEV1G"  # machine learning "watermelon book"
r = requests.get(url, timeout=30)
print(r.status_code)
r.encoding = r.apparent_encoding  # handle encoding issues
print(r.text)
print(r.request.headers)

## Demo: set user-agent using "headers"
import requests

url = "https://www.amazon.cn/dp/B005GNM3SS/"  # air purifier
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
# headers = {'User-Agent': 'Mozilla/5.0'}
r = requests.get(url, timeout=30, headers=headers)  # add header information for user-agent
print(r.status_code)
r.encoding = r.apparent_encoding  # handle encoding issues
print(r.text)
Example: submitting a search keyword to Baidu
Baidu's keyword search interface: "http://www.baidu.com/s?wd=keyword"
## Demo: set keywords to submit using "params"
import requests

keywords = {'wd': 'python scrape'}
r = requests.get("http://www.baidu.com/s", params=keywords)
print(r.status_code)
print(r.request.url)
print(r.text)
General-purpose code for fetching webpages with Requests
## General-purpose code for fetching webpages with Requests:
## exception handling, a timeout, encoding handling, and browser masquerading
import requests

# define get_html() function
def get_html(url):
    try:
        r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
        r.raise_for_status()  # throw HTTPError if the status code is not 200
        r.encoding = r.apparent_encoding  # handle encoding issues
        return r.text
    except requests.RequestException:
        return "Error: something is wrong!"

# call get_html() function
url_bad = "www.baidu.com"  # missing scheme, so the request raises an exception
print(get_html(url_bad))
print()
url = "http://www.baidu.com"
print(get_html(url))
BeautifulSoup
Parsing Webpages
BeautifulSoup: a powerful library for parsing webpages
There are other parsing libraries as well, such as lxml (in fact, BeautifulSoup supports using lxml as its parser; see the sketch after this list)
In addition, we can often use the browser to help us understand the structure of an HTML page
'Ctrl-Shift-I' in Chrome, or right-click -> View Page Source
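A minimal sketch of choosing the parser backend via the second argument of the BeautifulSoup constructor; note that 'lxml' is a third-party package that must be installed separately (pip install lxml), while 'html.parser' ships with Python:

from bs4 import BeautifulSoup

html = "<p>Hello</p>"

soup_std = BeautifulSoup(html, "html.parser")  # bundled pure-Python parser
soup_lxml = BeautifulSoup(html, "lxml")        # faster third-party parser
print(soup_std.p.text)   # Hello
print(soup_lxml.p.text)  # Hello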
htmlString = "<html><head><title>This is a title</title></head><body><h2>Test</h2><p>Hello world!</p></body></html>"

from bs4 import BeautifulSoup

soup = BeautifulSoup(htmlString, "html.parser")
print(htmlString)
print()
print(soup.prettify())  # pretty-printed output: the prettify() method
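Tags can also be reached through attribute-style navigation from the root of the parse tree; the later examples rely on this pattern (e.g. item.div.h3.a). A minimal sketch reusing the soup object from the cell above:

print(soup.title)         # <title>This is a title</title>
print(soup.title.string)  # This is a title
print(soup.body.h2.text)  # Test
print(soup.p.text)        # Hello world!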
from bs4 import BeautifulSoup
import requests

url = "http://www.crummy.com/software/BeautifulSoup"
r = requests.get(url)

## get BeautifulSoup object
soup = BeautifulSoup(r.text, "html.parser")
print(type(soup))

## get attribute value from an element:
## find tag: this only returns the first occurrence, not all tags in the string
first_tag = soup.find('a')
print(first_tag)

## get attribute `href`
print(first_tag.get('href'))
print(first_tag['href'])

## get text
print(first_tag.string)
print(first_tag.text)
print(first_tag.get_text())
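Since find() stops at the first match, here is a minimal sketch, reusing the soup object above, of find_all() and the CSS-selector method select(), both of which return every matching tag. The class name 'download' below is only an illustration, not a class guaranteed to exist on the page:

## find_all() returns a list of all matching tags
all_links = soup.find_all('a')
print(len(all_links))  # number of <a> tags on the page

## find_all() can also filter by attributes ('download' is illustrative)
print(soup.find_all('a', attrs={'class': 'download'}))

## select() takes a CSS selector and likewise returns all matches
for tag in soup.select('a[href]')[:5]:
    print(tag['href'])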
## Example fragment: scrape a paginated site and collect matching links
## (wrapped in a function so the fragment is runnable; base_url, page,
##  and headers are supplied by the caller)
import re
import requests
from bs4 import BeautifulSoup

def collect_links(base_url, page, headers):
    result = []
    for i in range(0, page, 1):
        # concatenate the url
        url = ''.join([base_url, str(i), '.html'])
        html = requests.get(url, headers=headers).text
        beautyHtml = BeautifulSoup(html, 'lxml')  # lxml: one of the parsers; fewest errors, most complete
        # generally, look for the relevant div elements
        div_list = beautyHtml.find_all('div', attrs={'class': 'feeds-item'})
        if div_list:
            for item in div_list:
                # get the value of a tag attribute
                href = item.div.h3.a['href']
                # get the text of a tag
                name = item.div.h3.a.text
                reg = r'201\d年\d+月'
                regResult = re.findall(reg, name, re.S)
                if len(regResult) != 0:
                    print('___Name:', name)
                    print('___ADD:', href)
                    result.append(href)
    return result

## get all links in the page
link_list = [link.get('href') for link in soup.find_all('a')]

external_links = []
for link in link_list:
    if link is not None and link[:4] == 'http':  # Note: lazy evaluation
        external_links.append(link)

# the same filter as a list comprehension
external_links = [link for link in link_list if link is not None and link.startswith('http')]
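The filter above keeps only absolute http(s) links and silently drops relative ones. A minimal sketch, assuming the same crummy.com page, that instead resolves relative links against the page URL with urljoin from the standard library:

from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

url = "http://www.crummy.com/software/BeautifulSoup"
soup = BeautifulSoup(requests.get(url, timeout=30).text, "html.parser")

## resolve every href (relative or absolute) against the page URL
all_links = [urljoin(url, link.get('href'))
             for link in soup.find_all('a')
             if link.get('href') is not None]
print(all_links[:10])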