# 1. Tokenization: segment every abstract into words with jieba.
jieba_name = []
for abstract in abstract_list:
    # jieba.cut returns a generator of tokens; extend() flattens it
    # into the running word list.
    jieba_name.extend(jieba.cut(str(abstract).strip(), cut_all=False))
# 2. Stop-word removal: build the stop-word list.
def stopwordslist(filepath):
    """Load a stop-word list from a UTF-8 text file, one word per line.

    Args:
        filepath: path to the stop-word file.

    Returns:
        list[str]: each line with surrounding whitespace stripped.
    """
    # Use a context manager so the file handle is closed deterministically
    # (the original opened the file and never closed it).
    with open(filepath, 'r', encoding='utf-8') as f:
        return [line.strip() for line in f]
# Keep only tokens that are not stop words and not whitespace/NaN artifacts.
# (`stopwords` is presumably the list returned by stopwordslist() — assigned
# outside this chunk.)
stopword_list = [
    word
    for word in jieba_name
    if word not in stopwords and word not in ("\t", " ", "nan")
]
# 3. Count how many times each word occurs.
# BUG FIX: the original kept ONE shared `number` counter across all words
# (`number += 1; dic[each] = number`), so a repeated word was assigned the
# global number of repeats seen so far — not its own frequency. Count each
# word independently instead.
dic = {}
for each in stopword_list:
    dic[each] = dic.get(each, 0) + 1
# 4. Convert the frequency dict into (word, count) tuples — the shape the
# word-cloud plotting step expects. dict.items() already yields tuples.
tuple_list = list(dic.items())
# Extract the average price per person for each entry; missing or broken -> 0.
for item in sublistbox:
    try:
        if "\u5747" in item:  # the average-price marker is present
            # NOTE(review): a lazy group at the very end of the pattern
            # ('\uffe5 (.*?)') always matches the empty string — confirm
            # the intended regex against the real HTML.
            matches = re.findall('\uffe5 (.*?)', item, re.S)
            person_avg.append(matches[0])
        else:
            person_avg.append(0)  # no average price -> placeholder 0
    except:
        # best-effort scrape: any parsing failure also falls back to 0
        person_avg.append(0)
剩下的 3 个字段(地址、推荐菜、评论)的处理方式与上面类似:
# Extract the address field for each entry; missing or broken -> "\u65e0".
address = []
for item in sublistbox:
    try:
        if "\u5740" in item:  # keyword marking the address section
            matched = re.findall('\u5740.*?des_line">(.*?)', item, re.S)
            address.append(matched[0])
        else:
            address.append("\u65e0")
    except:
        # best-effort scrape: fall back to the "none" placeholder
        address.append("\u65e0")
# Extract the recommended-dish field for each entry; missing or broken -> "\u65e0".
recommand = []
for item in sublistbox:
    try:
        if "\u63a8\u8350\u83dc" in item:  # keyword marking the dish section
            matched = re.findall('\u63a8\u8350\u83dc.*?des_line">(.*?)', item, re.S)
            recommand.append(matched[0])
        else:
            recommand.append("\u65e0")
    except:
        # best-effort scrape: fall back to the "none" placeholder
        recommand.append("\u65e0")
# Extract the comment/description field for each entry; missing or broken -> "\u65e0".
comment = []
for item in sublistbox:
    try:
        if "desbox" in item:  # keyword marking the description box
            matched = re.findall('.*?txt">(.*?)', item, re.S)
            comment.append(matched[0])
        else:
            comment.append("\u65e0")
    except:
        # best-effort scrape: fall back to the "none" placeholder
        comment.append("\u65e0")
4、以上是解析单页数据的完整过程;下面讲解如何抓取全部 200 页的字段数据:
# Chinese-name field (the score field is handled the same way):
# scrape all 200 list pages and collect every title.
cn_title_list = []
for page in range(1, 201):
    url = "https://travel.qunar.com/p-cs300022-changsha-meishi?page={}".format(page)
    headers = {"user-agent": "\u8bf7\u6c42\u5934"}  # placeholder UA string from the tutorial
    response = requests.get(url=url, headers=headers)
    result = response.content.decode()
    # NOTE(review): the lazy group in 'cn_tit">(.*?).*?countbox' is followed
    # by another lazy wildcard — verify it captures what is intended.
    cn_title_list.extend(re.findall('cn_tit">(.*?).*?countbox', result, re.S))
# Fetch and decode each of the 200 list pages. The extraction of fields from
# `result` presumably continues beyond this chunk — confirm against the full file.
for page in range(1, 201):
    url = "https://travel.qunar.com/p-cs300022-changsha-meishi?page={}".format(page)
    headers = {"user-agent": ""}
    response = requests.get(url=url, headers=headers)
    result = response.content.decode()