
Scraping Sina Weibo follower information with Python

djangolover • 7 years ago • 3800 views

Environment:

OS: Windows 7

Python: 3.3

IDE: PyCharm 4.0.4

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import time
import base64
import rsa
import binascii
import requests
import re
import json
import random
try:
    import cookielib
except ImportError:
    import http.cookiejar as cookielib

try:
    from PIL import Image
except ImportError:
    pass
try:
    from urllib.parse import quote_plus
except ImportError:
    from urllib import quote_plus

'''
If login protection is not enabled, you can log in without entering a captcha.
If login protection is enabled, a captcha must be entered.
'''


# Build the request headers
agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0'
headers = {
    # "Host": "www.weibo.com",
    'User-Agent': agent
}

session = requests.session()
session.cookies = cookielib.LWPCookieJar(filename='cookies')
try:
    session.cookies.load(ignore_discard=True)
except Exception:
    print("Cookies could not be loaded")


# Visit the initial page first, carrying the cookies
index_url = "http://weibo.com/login.php"
try:
    session.get(index_url, headers=headers, timeout=2)
except requests.exceptions.RequestException:
    session.get(index_url, headers=headers)
try:
    input = raw_input  # Python 2 compatibility
except NameError:
    pass


def get_su(username):
    """
    对 email 地址和手机号码 先 javascript 中 encodeURIComponent
    对应 Python 3 中的是 urllib.parse.quote_plus
    然后在 base64 加密后decode
    """
    username_quote = quote_plus(username)
    username_base64 = base64.b64encode(username_quote.encode("utf-8"))
    return username_base64.decode("utf-8")
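

# A quick illustrative check of get_su (values computed by hand for this
# write-up, not taken from the original post):
#   quote_plus("example@sina.com")  -> "example%40sina.com"
#   get_su("example@sina.com")      -> "ZXhhbXBsZSU0MHNpbmEuY29t"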


# Pre-login to obtain servertime, nonce, pubkey and rsakv
def get_server_data(su):
    pre_url = "http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su="
    pre_url = pre_url + su + "&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.18)&_="
    pre_url = pre_url + str(int(time.time() * 1000))
    pre_data_res = session.get(pre_url, headers=headers)

    # The response is JSONP: strip the callback name and the wrapping
    # parentheses, then parse the JSON (safer than eval on remote content)
    server_data = json.loads(
        pre_data_res.content.decode("utf-8").replace("sinaSSOController.preloginCallBack", '').strip(" \r\n\t();"))

    return server_data
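

# For reference, the prelogin endpoint replies with JSONP roughly along these
# lines (the field values here are illustrative placeholders, not real server
# output; the field names are the ones the login code reads below):
#   sinaSSOController.preloginCallBack({"servertime": 1449720000,
#       "nonce": "...", "pubkey": "...", "rsakv": "...",
#       "showpin": 0, "pcid": "..."})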


# print(server_data)


def get_password(password, servertime, nonce, pubkey):
    rsaPublickey = int(pubkey, 16)
    key = rsa.PublicKey(rsaPublickey, 65537)  # build the public key (exponent 65537)
    # Concatenate the plaintext exactly as Weibo's JS encryption code does:
    # servertime + '\t' + nonce + '\n' + password
    message = str(servertime) + '\t' + str(nonce) + '\n' + str(password)
    message = message.encode("utf-8")
    passwd = rsa.encrypt(message, key)  # encrypt
    passwd = binascii.b2a_hex(passwd)  # convert the ciphertext to hex
    return passwd
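

# A minimal local sanity check of the same encrypt-then-hex round trip
# (illustrative only; uses a freshly generated key pair, not Weibo's pubkey):
#   pub, priv = rsa.newkeys(1024)
#   hexed = binascii.b2a_hex(rsa.encrypt(b"secret", pub))
#   assert rsa.decrypt(binascii.a2b_hex(hexed), priv) == b"secret"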


def get_cha(pcid):
    cha_url = "http://login.sina.com.cn/cgi/pin.php?r="
    cha_url = cha_url + str(int(random.random() * 100000000)) + "&s=0&p="
    cha_url = cha_url + pcid
    cha_page = session.get(cha_url, headers=headers)
    with open("cha.jpg", 'wb') as f:
        f.write(cha_page.content)  # the with block closes the file automatically
    try:
        im = Image.open("cha.jpg")
        im.show()
        im.close()
    except Exception:
        print(u"Could not display the image; open cha.jpg in the current directory and type in the captcha")


def login(username, password):
    # su is the encoded username
    su = get_su(username)
    server_data = get_server_data(su)
    servertime = server_data["servertime"]
    nonce = server_data['nonce']
    rsakv = server_data["rsakv"]
    pubkey = server_data["pubkey"]
    showpin = server_data["showpin"]
    password_secret = get_password(password, servertime, nonce, pubkey)

    postdata = {
        'entry': 'weibo',
        'gateway': '1',
        'from': '',
        'savestate': '7',
        'useticket': '1',
        'pagerefer': "http://login.sina.com.cn/sso/logout.php?entry=miniblog&r=http%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl",
        'vsnf': '1',
        'su': su,
        'service': 'miniblog',
        'servertime': servertime,
        'nonce': nonce,
        'pwencode': 'rsa2',
        'rsakv': rsakv,
        'sp': password_secret,
        'sr': '1366*768',
        'encoding': 'UTF-8',
        'prelt': '115',
        'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack',
        'returntype': 'META'
        }
    login_url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)'
    if showpin == 0:
        login_page = session.post(login_url, data=postdata, headers=headers)
    else:
        pcid = server_data["pcid"]
        get_cha(pcid)
        postdata['door'] = input(u"Please enter the captcha: ")
        login_page = session.post(login_url, data=postdata, headers=headers)
    login_loop = (login_page.content.decode("GBK"))
    # print(login_loop)
    pa = r'location\.replace\([\'"](.*?)[\'"]\)'
    loop_url = re.findall(pa, login_loop)[0]
    # print(loop_url)
    # A check for whether the login actually succeeded could be added here;
    # left as a future improvement
    login_index = session.get(loop_url, headers=headers)
    uuid = login_index.text
    uuid_pa = r'"uniqueid":"(.*?)"'
    uuid_res = re.findall(uuid_pa, uuid, re.S)[0]
    web_weibo_url = "http://weibo.com/%s/profile?topnav=1&wvr=6&is_all=1" % uuid_res
    weibo_page = session.get(web_weibo_url, headers=headers)
    weibo_pa = r'
The lib3 referenced above is an HTTP interface module I wrote myself.



fileWriter.py

# coding=utf-8
__author__ = 'zhengjinwei'
import os


class FileWriter:
    def __init__(self, fileDir, fileName, mode):
        # create the target directory first, then open the file;
        # 'mode' avoids shadowing the built-in format()
        self.mkDir(fileDir)
        self.f = open(os.path.join(fileDir, fileName), mode)

    def mkDir(self, path):
        # create the directory if it does not already exist
        if not os.path.exists(path):
            os.makedirs(path)

    def write(self, contents):
        return self.f.write(contents)

    def close(self):
        self.f.close()
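
A minimal usage sketch for FileWriter (the directory and file name here are
illustrative; this assumes the class is saved as fileWriter.py):

from fileWriter import FileWriter

writer = FileWriter(u"./output", u"fans.txt", "w")
writer.write(u"uid,nickname\n")
writer.close()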
spider.py

# coding:utf-8
__author__ = 'zhengjinwei'

import re
import requests

try:
    import cookielib
except ImportError:
    import http.cookiejar as cookielib


class Spider:
    def __init__(self, userAgent):
        self.user_agent = userAgent
        self.headers = {
            'User-Agent': self.user_agent
        }

    def getHttp(self, url, param=None):
        return requests.get(url, params=param, headers=self.headers)

    def postHttp(self, url, postData=None):
        return requests.post(url, data=postData, headers=self.headers)

    def sessionPostHttp(self, url, param=None):
        session = requests.session()
        session.cookies = cookielib.LWPCookieJar(filename='cookies')
        try:
            session.cookies.load(ignore_discard=True)
            print(url, param)
            return session.post(url, data=param, json=None, headers=self.headers)
        except Exception:
            print("Cookies could not be loaded")
            return None

    def sessionGetHttp(self, url, param=None):
        session = requests.session()
        session.cookies = cookielib.LWPCookieJar(filename='cookies')
        try:
            session.cookies.load(ignore_discard=True)
            return session.get(url, headers=self.headers, allow_redirects=False)
        except Exception:
            print("Cookies could not be loaded")
            return None
    def parseReg(self, content, strPattern, count):
        # find all matches and keep the first `count` capture groups of each
        pattern = re.compile(strPattern, re.S)
        items = re.findall(pattern, content)

        contents = []
        for item in items:
            temArr = []
            for i in range(0, count):
                temArr.append(item[i])
            contents.append(temArr)

        return contents

    def getContents(self, url, strPattern, count, method="get", param=None):
        # fetch the page, then reuse parseReg to extract the capture groups
        if method == "get":
            page = self.getHttp(url, param).text
        else:
            page = self.postHttp(url, param).text

        return self.parseReg(page, strPattern, count)



# demo = Spider("Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko")
#
# t = demo.getHttp("https://www.baidu.com/index.php?tn=02049043_23_pg")

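For example, parseReg pulls the capture groups of every match out of a page;
a small self-contained sketch (the HTML snippet and pattern below are made up
for illustration):

demo = Spider("Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko")
html = '<li uid="123">Alice</li><li uid="456">Bob</li>'
rows = demo.parseReg(html, r'<li uid="(.*?)">(.*?)</li>', 2)
print(rows)  # [['123', 'Alice'], ['456', 'Bob']]
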
Result screenshot: (image omitted from this text version)

Notes:

1. Sina Weibo itself restricts viewing: only the first 5 pages of follower data can be seen.

2. For reverse-engineering the requests, use Firefox with the Firebug plugin to intercept traffic and capture the cookies and other parameters.
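
Putting the pieces together, here is a hedged sketch of walking those first 5
pages with the Spider class (the follower-list URL below is a hypothetical
placeholder; capture the real request URL and parameters with Firebug as
described above):

spider = Spider('Mozilla/5.0 (Windows NT 6.3; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0')
for page in range(1, 6):  # Weibo only exposes the first 5 pages
    # hypothetical URL pattern; replace it with the one seen in Firebug
    url = "http://weibo.com/p/1005051234567890/follow?page=%d" % page
    resp = spider.sessionGetHttp(url)
    if resp is not None:
        print(page, resp.status_code)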

Original post: http://www.python88.com/topic/1669
Replies [ 1 ]  |  Latest reply 7 years ago
Py站长
Reply   •   #1
Py站长    7 years ago

Nice work, +1~