# Pull the four product fields out of the raw search-result HTML/JSON blob.
# (.*?) is a non-greedy capture, so each match stops at the next closing quote.
a = re.findall(r'"raw_title":"(.*?)"', html)  # product titles
b = re.findall(r'"view_price":"(.*?)"', html)  # prices
c = re.findall(r'"item_loc":"(.*?)"', html)  # seller locations
d = re.findall(r'"view_sales":"(.*?)"', html)  # sales counts ("付款人数")
三:函数填写
这里我写了三个函数。
第一个函数用来获取 HTML 网页,代码如下:
def GetHtml(url):
    """Fetch *url* and return the requests Response object.

    Relies on the module-level ``headers`` dict (set in ``__main__``).
    Raises ``requests.HTTPError`` for non-2xx responses.
    """
    r = requests.get(url, headers=headers)
    r.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page
    # Use the encoding sniffed from the body so Chinese text decodes correctly.
    r.encoding = r.apparent_encoding
    return r
第二个函数用于生成要爬取的各页搜索结果的 URL,代码如下:
def Geturls(q, x):
    """Build the list of Taobao search-result URLs for keyword *q*.

    q: search keyword inserted into the query string.
    x: number of pages to crawl; returns exactly *x* URLs.
    Page n (n >= 2) is addressed via the ``s`` offset parameter (44 items/page).
    """
    url = "https://s.taobao.com/search?q=" + q + "&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm" \
          "=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306 "
    urls = []
    urls.append(url)
    if x == 1:
        return urls
    for i in range(1, x):
        # Subsequent pages use the paging offset s = i * 44.
        url = "https://s.taobao.com/search?q=" + q + "&commend=all&ssid=s5-e&search_type=item" \
              "&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306" \
              "&bcoffset=3&ntoffset=3&p4ppushleft=1%2C48&s=" + str(i * 44)
        urls.append(url)
    return urls
第三个函数用于提取我们需要的商品信息并写入 Excel 表格,代码如下:
def GetxxintoExcel(html):
    """Extract product fields from a result page and append rows to the worksheet.

    Uses the module-level ``worksheet`` (xlsxwriter sheet) and ``count``
    (next free row offset), both created in ``__main__``.
    """
    global count  # running row offset so successive pages append below earlier ones
    a = re.findall(r'"raw_title":"(.*?)"', html)  # (.*?) matches non-greedily
    b = re.findall(r'"view_price":"(.*?)"', html)
    c = re.findall(r'"item_loc":"(.*?)"', html)
    d = re.findall(r'"view_sales":"(.*?)"', html)
    x = []
    for i in range(len(a)):
        try:
            x.append((a[i], b[i], c[i], d[i]))  # group the fields of one product
        except IndexError:
            # The four lists can be uneven if the page HTML is incomplete; stop there.
            break
    for i in range(len(x)):
        # worksheet.write(row, col, data); row 0 holds the header, hence +1.
        worksheet.write(count + i + 1, 0, x[i][0])
        worksheet.write(count + i + 1, 1, x[i][1])
        worksheet.write(count + i + 1, 2, x[i][2])
        worksheet.write(count + i + 1, 3, x[i][3])
    count = count + len(x)  # the next page starts right after the rows just written
    return print("已完成")
四:主函数填写
if __name__ == "__main__":
    count = 0  # next free row offset in the worksheet (row 0 is the header)
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
        # The cookie is personal; because of anti-scraping measures you may need to
        # refresh it if you crawl too fast.
        "cookie": "",
    }
    q = input("输入货物")
    x = int(input("你想爬取几页"))
    urls = Geturls(q, x)
    workbook = xlsxwriter.Workbook(q + ".xlsx")
    worksheet = workbook.add_worksheet()
    # Column widths: wide for the title, narrower for the rest.
    worksheet.set_column('A:A', 70)
    worksheet.set_column('B:B', 20)
    worksheet.set_column('C:C', 20)
    worksheet.set_column('D:D', 20)
    worksheet.write('A1', '名称')
    worksheet.write('B1', '价格')
    worksheet.write('C1', '地区')
    worksheet.write('D1', '付款人数')
    for url in urls:
        html = GetHtml(url)
        s = GetxxintoExcel(html.text)
        time.sleep(5)  # throttle between pages to reduce the chance of being blocked
    workbook.close()  # don't open the .xlsx (in the current directory) before the program ends
五:完整代码
import re
import requests
import xlsxwriter
import time
def GetxxintoExcel(html):
    """Extract product fields from a result page and append rows to the worksheet.

    Uses the module-level ``worksheet`` and ``count`` created in ``__main__``.
    """
    global count  # running row offset so successive pages append below earlier ones
    a = re.findall(r'"raw_title":"(.*?)"', html)  # titles, non-greedy match
    b = re.findall(r'"view_price":"(.*?)"', html)  # prices
    c = re.findall(r'"item_loc":"(.*?)"', html)  # seller locations
    d = re.findall(r'"view_sales":"(.*?)"', html)  # sales counts
    x = []
    for i in range(len(a)):
        try:
            x.append((a[i], b[i], c[i], d[i]))
        except IndexError:
            # The four lists can be uneven if the page HTML is incomplete; stop there.
            break
    for i in range(len(x)):
        # worksheet.write(row, col, data); row 0 holds the header, hence +1.
        worksheet.write(count + i + 1, 0, x[i][0])
        worksheet.write(count + i + 1, 1, x[i][1])
        worksheet.write(count + i + 1, 2, x[i][2])
        worksheet.write(count + i + 1, 3, x[i][3])
    count = count + len(x)  # the next page starts right after the rows just written
    return print("已完成")


def Geturls(q, x):
    """Build the list of Taobao search-result URLs for keyword *q*, pages 1..*x*."""
    url = "https://s.taobao.com/search?q=" + q + "&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm" \
          "=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306 "
    urls = []
    urls.append(url)
    if x == 1:
        return urls
    for i in range(1, x):
        # Subsequent pages are addressed via the paging offset s = i * 44.
        url = "https://s.taobao.com/search?q=" + q + "&commend=all&ssid=s5-e&search_type=item" \
              "&sourceId=tb.index&spm=a21bo.2017.201856-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306" \
              "&bcoffset=3&ntoffset=3&p4ppushleft=1%2C48&s=" + str(i * 44)
        urls.append(url)
    return urls


def GetHtml(url):
    """GET *url* with the shared ``headers``; raise for HTTP errors."""
    r = requests.get(url, headers=headers)
    r.raise_for_status()
    # Use the encoding sniffed from the body so Chinese text decodes correctly.
    r.encoding = r.apparent_encoding
    return r


if __name__ == "__main__":
    count = 0  # next free row offset in the worksheet (row 0 is the header)
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
        "cookie": "",  # personal cookie; refresh it if anti-scraping blocks you
    }
    q = input("输入货物")
    x = int(input("你想爬取几页"))
    urls = Geturls(q, x)
    workbook = xlsxwriter.Workbook(q + ".xlsx")
    worksheet = workbook.add_worksheet()
    worksheet.set_column('A:A', 70)
    worksheet.set_column('B:B', 20)
    worksheet.set_column('C:C', 20)
    worksheet.set_column('D:D', 20)
    worksheet.write('A1', '名称')
    worksheet.write('B1', '价格')
    worksheet.write('C1', '地区')
    worksheet.write('D1', '付款人数')
    for url in urls:
        html = GetHtml(url)
        s = GetxxintoExcel(html.text)
        time.sleep(5)  # throttle between pages to reduce the chance of being blocked
    workbook.close()  # don't open the .xlsx before the program finishes