# Parse the markup held in html_data and select every poem container
# (<div class="sons">).  NOTE(review): `html` is presumably lxml.html and
# html_data the downloaded page text -- neither is defined in this snippet.
selector = html.fromstring(html_data)
poets = selector.xpath("/html/body/div[2]/div[1]/div[@class='sons']")

for poet in poets:
    # //text() yields every text node under <b>; join them and trim the ends.
    title = ''.join(poet.xpath("div[1]/p[1]/a/b//text()")).strip()
    print(title)
诗人和朝代被分隔至两行,说明二者之间存在换行符及空格,可以用包含 .strip() 的列表推导式去除:
for poet in poets:
    title = ''.join(poet.xpath("div[1]/p[1]/a/b//text()")).strip()

    # Second <p>: joined text nodes, still containing line breaks and spaces.
    source = ''.join(poet.xpath('div[1]/p[2]//text()'))
    # `source` is a single string here, so the comprehension walks it one
    # character at a time; whitespace characters strip to '' and disappear.
    source = ''.join([i.strip() for i in source])

    print(title, source)
最后是对诗句的解析。为了获取关键字真正所在的句子,我们要通过句号或者问号将整首诗断开成多个完整的句子:
for poet in poets:
    title = ''.join(poet.xpath("div[1]/p[1]/a/b//text()")).strip()

    source = ''.join(poet.xpath('div[1]/p[2]//text()'))
    # `source` is one string, so this drops every whitespace character.
    source = ''.join([i.strip() for i in source])

    # Turn every sentence boundary into '。' and split on it.  Chinese text
    # normally uses the full-width '?', so both question-mark forms count.
    contents = (
        ''.join(poet.xpath('div[1]/div[@class="contson"]//text()'))
        .strip()
        .replace('\n', '。')
        .replace('?', '。')
        .replace('?', '。')
        .split('。')
    )

    print(title, source, contents)
对每一首诗逐句判断是否包含关键字:
for poet in poets:
    title = ''.join(poet.xpath("div[1]/p[1]/a/b//text()")).strip()

    source = ''.join(poet.xpath('div[1]/p[2]//text()'))
    # `source` is one string, so this drops every whitespace character.
    source = ''.join([i.strip() for i in source])

    # Turn every sentence boundary into '。' and split on it.  Chinese text
    # normally uses the full-width '?', so both question-mark forms count.
    contents = (
        ''.join(poet.xpath('div[1]/div[@class="contson"]//text()'))
        .strip()
        .replace('\n', '。')
        .replace('?', '。')
        .replace('?', '。')
        .split('。')
    )

    # A poem may contain the keyword in more than one sentence; every such
    # sentence is wanted.
    content_lst = [i.strip() + '。' for i in contents if '酒' in i]

    if not content_lst:
        # The keyword may appear only in the title; skip such poems.
        continue

    # Two matching sentences may be identical, so deduplicate.
    # dict.fromkeys keeps first-seen order, unlike set().
    for j in dict.fromkeys(content_lst):
        print(j, title, source)
# Request headers sent with every page download: a desktop Chrome
# user-agent string.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
}
def _keyword_sentences(text, keyword):
    """Return the distinct sentences of ``text`` that contain ``keyword``.

    ``text`` is trimmed, then cut at line breaks, '。' and question marks
    (ASCII '?' and full-width '?').  Each matching piece is trimmed and
    given a closing '。'; duplicates are dropped, keeping first-seen order.
    """
    text = text.strip()
    for mark in ('\n', '?', '?'):
        text = text.replace(mark, '。')
    hits = [part.strip() + '。' for part in text.split('。') if keyword in part]
    # dict.fromkeys de-duplicates while preserving order (set() would not).
    return list(dict.fromkeys(hits))


def poet_content(keyword, num, url):
    """Download ``url`` and print every poem sentence containing ``keyword``.

    Each hit is printed as a numbered sentence, followed by the poem's title
    and author line and a blank line.

    Args:
        keyword: Text to look for inside each sentence.
        num: Number to give the first sentence printed by this call.
        url: Address of the page to download and parse.

    Returns:
        The number following the last one printed, to be passed to the next
        call so numbering continues across pages.

    Raises:
        requests.RequestException: if the download fails or times out.
    """
    # A timeout keeps a stalled server from hanging the script forever.
    html_data = requests.get(url, headers=headers, timeout=10).text
    selector = html.fromstring(html_data)
    poets = selector.xpath("/html/body/div[2]/div[1]/div[@class='sons']")

    for poet in poets:
        title = ''.join(poet.xpath("div[1]/p[1]/a/b//text()")).strip()

        source = ''.join(poet.xpath('div[1]/p[2]//text()'))
        # `source` is one string, so this drops every whitespace character.
        source = ''.join([i.strip() for i in source])

        body = ''.join(poet.xpath('div[1]/div[@class="contson"]//text()'))
        content_lst = _keyword_sentences(body, keyword)
        if not content_lst:
            # No sentence of the poem body matched; skip this poem.
            continue

        for j in content_lst:
            print(num, j)
            print(f'<{title}>', source)
            print('')
            num += 1

    return num
if __name__ == '__main__':
    # Local import: only the script entry point needs it.
    from urllib.parse import quote

    keyword = input('> 请输入关键词: ')
    print('')

    # Percent-encode the keyword so characters such as '&', '#' or '+'
    # cannot break the query string.
    encoded = quote(keyword, safe='')

    num = 1
    # Walk result pages 1 and 2, carrying the running number across pages.
    for page in range(1, 3):
        url = f'https://so.gushiwen.org/search.aspx?page={page}&value={encoded}'
        num = poet_content(keyword, num, url)