Python模拟登录CSDN

用Scrapy模拟登录CSDN,并进入个人中心抓取数据


scrapy主代码:

import scrapy
from scrapy.http import FormRequest
class Csdn(scrapy.Spider):
    """Log in to CSDN through its passport form, then read the personal page.

    Request flow:
      1. ``start_requests`` fetches the login page.
      2. ``parse_login`` submits the credentials plus the form's hidden fields.
      3. ``after_post`` opens the CSDN home page with the same cookie jar.
      4. ``parse_index`` copies the cookies that were sent with that request
         into an explicit ``cookies=`` dict for the personal-centre request.
      5. ``parse_blog`` checks whether the login worked and logs one figure.

    The credentials are read from the ``CSDN_USERNAME`` and ``CSDN_PASSWORD``
    settings of the crawler.
    """

    name = 'csdn_cookie_login'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Build the spider with credentials taken from the crawler settings.

        Delegating to ``scrapy.Spider.from_crawler`` keeps the crawler bound
        to the spider, so ``self.crawler`` and ``self.settings`` stay usable.
        Credentials passed explicitly as spider arguments take precedence
        over the settings values.
        """
        kwargs.setdefault('username', crawler.settings.get('CSDN_USERNAME'))
        kwargs.setdefault('password', crawler.settings.get('CSDN_PASSWORD'))
        return super().from_crawler(crawler, *args, **kwargs)

    def start_requests(self):
        """Return the request for the login page, bound to cookie jar ``1``.

        Returns an empty list (after logging an error) when the credentials
        are missing, because every later step depends on them.
        """
        if not getattr(self, 'username', None) or not getattr(self, 'password', None):
            self.logger.error(
                "CSDN_USERNAME / CSDN_PASSWORD are not configured; nothing to do")
            return []
        return [
            scrapy.Request('https://passport.csdn.net/account/login',
                           callback=self.parse_login,
                           meta={'cookiejar': 1}),
        ]

    def parse_login(self, response):
        """Submit the login form found on the login page.

        The hidden inputs of the ``fm1`` element are sent along with the
        username and password.
        """
        data = {'username': self.username, 'password': self.password}
        # Read name/value through the attribute axis instead of splitting the
        # serialized HTML on quotes, which depends on attribute order.
        for hidden in response.xpath('//*[@id="fm1"]/input[@type="hidden"]'):
            field = hidden.xpath('@name').extract_first()
            if field:
                data[field] = hidden.xpath('@value').extract_first(default='')
        return [
            FormRequest.from_response(
                response,
                url="https://passport.csdn.net/account/verify",
                formdata=data,
                meta={'cookiejar': response.meta['cookiejar']},
                callback=self.after_post,
            ),
        ]

    def after_post(self, response):
        """Open the CSDN home page with the cookie jar used for the login."""
        return [
            scrapy.Request("https://www.csdn.net",
                           callback=self.parse_index,
                           # Propagate the jar instead of repeating the literal.
                           meta={'cookiejar': response.meta['cookiejar']}),
        ]

    @staticmethod
    def _parse_cookie_header(raw_header):
        """Turn a ``Cookie`` header string (``a=1; b=2``) into a dict."""
        cookies = {}
        for fragment in raw_header.split(';'):
            # partition() keeps '=' characters that belong to the value and
            # does not fail on a fragment that has no '=' at all.
            key, _, value = fragment.partition('=')
            key = key.strip()
            if key:
                cookies[key] = value
        return cookies

    def parse_index(self, response):
        """Request the personal centre, passing the session cookies explicitly.

        The cookies are taken from the ``Cookie`` header of the request that
        produced ``response``.
        """
        cookie_headers = response.request.headers.getlist('Cookie')
        if cookie_headers:
            cookies_dict = self._parse_cookie_header(
                cookie_headers[0].decode('utf-8'))
        else:
            # No cookie was sent, so the login did not establish a session.
            # Continue anyway so that parse_blog reports the failure.
            self.logger.warning("no Cookie header on %s", response.request.url)
            cookies_dict = {}
        return [
            scrapy.Request("http://my.csdn.net/",
                           callback=self.parse_blog,
                           cookies=cookies_dict),
        ]

    def parse_blog(self, response):
        """Log whether the login succeeded and the followed-users figure.

        The login counts as successful when the username appears in the page.
        """
        self.logger.warning("parse_blog")
        # response.text replaces body_as_unicode(), which is deprecated in
        # newer Scrapy releases.
        page_text = response.text
        if self.username in page_text:
            self.logger.warning("登录成功")
        else:
            self.logger.warning("登录失败!")
        res = response.xpath('//a[@href="/my/follow"][1]/text()').extract_first()
        # Lazy %-formatting avoids a TypeError when the link is missing.
        self.logger.warning('我关注的人数:%s', res)

经不完整验证,目前可以登录的页面有:

跑scrapy:

scrapy crawl csdn_cookie_login -L WARNING

结果:

2017-12-06 22:00:24 [csdn_cookie_login] WARNING: parse_blog
2017-12-06 22:00:24 [csdn_cookie_login] WARNING: 登录成功
2017-12-06 22:00:24 [csdn_cookie_login] WARNING: 我关注的人数:3

模拟登录CSDN成功。

模拟登录需要配合浏览器调试工具查看网页源码、表单字段等信息,并反复调试,一般不会一次就成功。此外,对于一些通过异步加载生成内容或进行登录校验的页面,并不能用这种方式模拟登录,需要改用别的方式。