# Switch to the cookie-consent iframe that pops up
driver.switch_to_frame("sp_message_iframe_490357")
# Click to accept cookie collection
driver.find_element_by_xpath("//button[@title='YES, I AGREE']").click()
time.sleep(5)

# Write the cookies to a file
orcookies = driver.get_cookies()
print(orcookies)
cookies = {}
for item in orcookies:
    cookies[item['name']] = item['value']
with open("wsjcookies.txt", "w") as f:
    f.write(json.dumps(cookies))
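Note that `switch_to_frame` and `find_element_by_xpath` belong to the old Selenium API and were removed in Selenium 4; if you run a recent Selenium, the equivalent calls look like this:

from selenium.webdriver.common.by import By

# Selenium 4 replacements for the deprecated helpers used above
driver.switch_to.frame("sp_message_iframe_490357")
driver.find_element(By.XPATH, "//button[@title='YES, I AGREE']").click()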
import time
from lxml import etree
import csv
import re
from tqdm import tqdm
import requests
import json
import pandas as pd
import unicodedata
from string import punctuation
f = open('wsjdaylist.txt', 'r')
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:88.0) Gecko/20100101 Firefox/88.0',
    'content-type': 'application/json; charset=UTF-8',
    'Connection': 'keep-alive'
}
def parsearticle(date, daylink):
    with open("wsjcookies.txt", "r") as f:
        cookies = f.read()
    cookies = json.loads(cookies)
    session = requests.session()
    url = "https://www.wsj.com" + daylink
    data = session.get(url, headers=headers, cookies=cookies)
    page = etree.HTML(data.content)

    # The section label sits in a sibling div above each headline
    sector = page.xpath("//h2[@class='WSJTheme--headline--unZqjb45 reset WSJTheme--heading-3--2z_phq5h typography--serif-display--ZXeuhS5E ']/parent::div[1]/parent::div[1]/preceding-sibling::div[1]//text()[1]")
    module = [unicodedata.normalize('NFD', i).encode('ascii', 'ignore').decode("utf-8").replace("\n", " ").replace('\t', "") for i in sector if len(i) > 1]

    article = page.xpath("//h2[@class='WSJTheme--headline--unZqjb45 reset WSJTheme--heading-3--2z_phq5h typography--serif-display--ZXeuhS5E ']/a/text()")
    article = [unicodedata.normalize('NFD', i).encode('ascii', 'ignore').decode("utf-8").replace("\n", " ").replace('\t', "") for i in article]
    # Links to the individual articles, taken from the same headline nodes
    href = page.xpath("//h2[@class='WSJTheme--headline--unZqjb45 reset WSJTheme--heading-3--2z_phq5h typography--serif-display--ZXeuhS5E ']/a/@href")

    with open("wsjarticlelist_test.csv", 'a') as h:
        k = csv.writer(h)
        for i in range(len(href)):
            k.writerow([article[i], module[i], href[i], date])
Rather than grabbing all the text under every matching HTML element at once, the script iterates over each element and filters its contents more finely. This is slightly slower, but it produces a much cleaner, more readable result.
3.2 Article-scraping code
import time
from lxml import etree
import csv
import re
from tqdm import tqdm
import requests
import json
import pandas as pd
import unicodedata
from string import punctuation
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:88.0) Gecko/20100101 Firefox/88.0',
    'content-type': 'application/json; charset=UTF-8',
    'Connection': 'keep-alive'
}
def translist(infolist):
    # Strip each text node, then keep only non-empty entries
    out = list(filter(lambda s: s and (type(s) != str or len(s.strip()) > 0), [i.strip() for i in infolist]))
    return out
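As a quick illustration, `translist` drops whitespace-only text nodes and keeps real text:

translist(['\n', '  ', 'Stocks rallied', '\t'])  # returns ['Stocks rallied']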
def parsearticle(title, date, articlelink):
    with open("wsjcookies.txt", "r") as f:
        cookies = f.read()
    cookies = json.loads(cookies)
    session = requests.session()
    data = session.get(articlelink, headers=headers, cookies=cookies)
    time.sleep(1)

    page = etree.HTML(data.content)

    arcontent = title + '\n\n' + date + '\n\n'

    content = page.xpath("//div[@class='article-content ']//p")
    for element in content:
        subelement = etree.tostring(element).decode()
        subpage = etree.HTML(subelement)
        tree = subpage.xpath('//text()')
        # Join the text nodes of each paragraph, dropping non-breaking spaces from the markup
        line = ''.join(translist(tree)).replace('\n', '').replace('\t', '').replace('\xa0', '').strip() + '\n\n'
        arcontent += line

    return arcontent
# Article list produced in 3.1, with columns [title, module, link, date]
df = pd.read_csv("wsjarticlelist_test.csv", header=None)

for row in tqdm(df.iterrows()):
    title = row[1][0].replace('/', '-')
    articlelink = row[1][2]
    date = row[1][3].replace('/', '-')
    arcontent = parsearticle(title, date, articlelink)
    with open("/Users/mengjiexu/Dropbox/articles/%s_%s.txt" % (date, title), 'w') as g:
        g.write(arcontent)
3.3 Sample scraped article
4. Translation
Readers who find English slow going can feed the articles to the Baidu translation API, which makes it possible to skim and extract information from a large number of articles quickly. To speed up translation, it is best to translate each article as one single block of text.
4.1 Article-translation code
import os
import requests
import random
import json
from hashlib import md5
from tqdm import tqdm
def trans(q):
    # Send request
    r = requests.post(url, params=payload, headers=headers)
    result = r.json()

    # Show response
    print(json.dumps(result, indent=4, ensure_ascii=False))

    return result["trans_result"][0]["dst"]
# Iterate over the article files saved in section 3.2
file_list = os.listdir("/Users/mengjiexu/Dropbox/articles/")

for file in tqdm(file_list):
    content = open("/Users/mengjiexu/Dropbox/articles/%s" % file, 'r').read()
    print(trans(content.strip()))
4.2 Sample translated article