极客时间课程爬虫

介绍

本文简单分析如何实现一个抓取极客时间课程内容的爬虫。

公共post方法

代码如下(依赖 `json` 和 `requests` 模块,需先导入):

# Session object shared by every request made in this file.
session = requests.session()

# Headers used by post() when the caller does not supply its own.
headers = {
    'Content-Type': "application/json",
    'Referer': "https://account.geekbang.org/dashboard/buy",
    'User-Agent': (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0"
    ),
    'Accept': "*/*",
    'Host': "time.geekbang.org",
}


def post(url, data=None, hs=headers, retry=3):
    """
    Send a JSON POST request through the shared session, retrying on failure.

    :param url: request URL
    :param data: JSON-serialisable payload; a falsy value is sent as an empty
        JSON object ("{}")
    :param hs: request headers
    :param retry: number of extra attempts made after a failed one (transport
        error or non-200 status); a value <= 0 or None means a single attempt
    :return: the response object on HTTP 200, otherwise '' once every attempt
        has returned a non-200 status
    :raises requests.RequestException: if the last attempt fails at the
        transport level
    """
    payload = json.dumps(data) if data else "{}"
    # One initial attempt plus `retry` additional ones.
    attempts = 1 + max(0, retry or 0)
    for attempt in range(1, attempts + 1):
        try:
            res = session.request("POST", url, data=payload, headers=hs)
        except requests.RequestException:
            # Only give up (and surface the error) on the final attempt.
            if attempt == attempts:
                raise
            continue
        if res.status_code == 200:
            return res
    # Kept for backward compatibility: callers receive '' on a non-200 result.
    return ''

登录

代码如下(登录失败时会调用 `sys.exit`,需先导入 `sys` 模块):

def login(phone, pwd):
    """
    Log in with a phone number and password and keep the returned cookies.

    On failure (no usable HTTP response, or a non-zero ``code`` in the JSON
    reply) an error is printed and the process exits with status 1.

    :param phone: phone number
    :param pwd: password
    :return: None
    """
    url = 'https://account.geekbang.org/account/ticket/login'

    login_hd = {
        # The User-Agent has to be replaced, otherwise the request is rejected.
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0",
        'Content-Type': "application/json",
        'Referer': "https://account.geekbang.org",
        'Host': "account.geekbang.org",
    }
    data = {
        "country": 86,
        "cellphone": phone,
        "password": pwd,
        "remember": 1,
        "platform": 3,
        "appid": 1
    }

    response = post(url, data, hs=login_hd)
    # post() returns '' on a non-200 status; calling .json() on that would
    # raise AttributeError, so report it like any other login failure.
    if not response:
        print("login failed, error: {}".format("no valid response from server"))
        sys.exit(1)

    res = response.json()
    if res['code'] != 0:
        msg = res['error']['msg']
        print("login failed, error: {}".format(msg))
        sys.exit(1)

    # Reuse the cookies from the login response for all later requests.
    session.cookies = response.cookies

获取指定课程的所有章节

代码如下:

def get_all_articles(product):
    """
    Fetch the list of articles that belong to a column.

    :param product: id of a single product (column)
    :return: list of dicts, each with the keys ``id`` and ``article_title``
    :raises RuntimeError: if the request did not yield a usable response
    """
    url = 'https://time.geekbang.org/serv/v1/column/articles'
    payload = {
        "cid": product,
        "size": 500,
        "prev": 0,
        "order": "earliest",
        "sample": False
    }
    response = post(url, payload)
    # post() returns '' on a non-200 status; fail with a clear message instead
    # of an AttributeError on ''.json().
    if not response:
        raise RuntimeError("failed to fetch article list for column %s" % product)
    # No `encoding` keyword: it is forwarded to json.loads, which rejects it
    # on Python 3.9+.
    data = response.json()['data']
    return [
        {'id': d['id'], 'article_title': d['article_title']}
        for d in data['list']
    ]

获取指定章节的内容

代码如下:

def get_content(articleid):
    """
    Fetch the content of a single article.

    :param articleid: article id
    :return: the article HTML, prefixed with a UTF-8 ``<meta>`` charset tag
    :raises RuntimeError: if the request did not yield a usable response
    :raises Exception: if the reply carries no ``article_content`` field
    """
    url = 'https://time.geekbang.org/serv/v1/article'

    payload = {
        "id": articleid,
        "include_neighbors": False,
        "is_freelyread": False
    }
    response = post(url, payload)
    # post() returns '' on a non-200 status; fail with a clear message instead
    # of an AttributeError on ''.json().
    if not response:
        raise RuntimeError("failed to fetch article %s" % articleid)
    # No `encoding` keyword: it is forwarded to json.loads, which rejects it
    # on Python 3.9+.
    data = response.json()['data']
    if 'article_content' not in data:
        raise Exception("no article content, data is: %s" % json.dumps(data, indent=4))
    return '<meta charset="utf-8">%s' % data['article_content']

获取指定课程的所有内容

代码如下:

def get_articles(product, phone='123123123123', pwd='1sefw'):
    """
    Log in, then print the HTML of every article in the given course.

    :param product: course id, as it appears in the course URL
    :param phone: phone number used to log in (the default is a placeholder)
    :param pwd: password used to log in (the default is a placeholder)
    :return: None
    """
    login(phone, pwd)
    for item in get_all_articles(product):
        print(get_content(item['id']))

保存为pdf

上述函数输出的是 HTML 内容,可以再通过 pdfkit 库将其转换为 PDF 文件。