BeautifulSoup 和 json 库在爬虫项目中的应用

重构人人贷爬虫的过程中,主要爬取的数据是以 json 格式呈现的,要提取的 html 内容如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
<script id="credit-info-data" type="text/x-json">
{
"data": {
"creditInfo": {
"account": "INVALID",
"album": "INVALID",
"borrowStudy": "VALID",
"car": "INVALID",
"child": "INVALID",
"credit": "FAILED",
"creditInfoId": 499250,
"detailInformation": "VALID",
"fieldAudit": "INVALID",
"graduation": "PENDING",
"house": "INVALID",
"identification": "VALID",
"identificationScanning": "VALID",
"incomeDuty": "PENDING",
"kaixin": "INVALID",
"lastUpdateTime": "Aug 1, 2014 12:00:00 AM",
"marriage": "VALID",
"mobile": "VALID",
"mobileAuth": "INVALID",
"mobileReceipt": "INVALID",
"other": "INVALID",
"renren": "INVALID",
"residence": "VALID",
"titles": "INVALID",
"user": 503971,
"version": 24,
"video": "PENDING",
"work": "OVERDUE"
},
"creditPassedTime": {
"creditPassedTimeId": 499214,
"detailInfomation": "Nov 19, 2013 10:57:21 PM",
"identification": "Nov 19, 2013 3:14:27 PM",
"identificationScanning": "Nov 21, 2013 11:36:55 AM",
"lastUpdateTime": "Aug 1, 2014 12:00:00 AM",
"marriage": "Nov 21, 2013 11:37:32 AM",
"mobile": "Nov 19, 2013 3:10:53 PM",
"residence": "Nov 21, 2013 11:37:44 AM",
"user": 503971,
"work": "Nov 21, 2013 11:37:25 AM"
},
"loan": {
"address": "\u5c71\u4e1c",
"allProtected": false,
"allowAccess": true,
"amount": 30000.0,
"amountPerShare": 50.0,
"avatar": "",
"borrowType": "\u8d2d\u8f66\u501f\u6b3e",
"borrowerId": 503971,
"borrowerLevel": "HR",
"currentIsRepaid": false,
"description": "\u672c\u4eba\u662f\u9ad8\u4e2d\u6559\u5e08\uff0c\u5de5\u8d44\u7a33\u5b9a\uff0c\u73b0\u5728\u4e70\u8f66\u5411\u5927\u5bb6\u501f\u6b3e\uff0c\u6bcf\u6708\u53d1\u5de5\u8d44\u6309\u65f6\u5f52\u8fd8\u3002",
"displayLoanType": "XYRZ",
"finishedRatio": 0.0,
"forbidComment": false,
"interest": 22.0,
"interestPerShare": 0.0,
"jobType": "\u5de5\u85aa\u9636\u5c42",
"leftMonths": 24,
"loanId": 123456,
"loanType": "DEBX",
"monthlyMinInterest": "[{\"month\":\"3\",\"minInterest\":\"10\",\"maxInterest\":\"24\",\"mgmtFee\":\"0.3\",\"tradeFee\":\"0\",\"guaranteeFee\":\"5\",\"inRepayPenalFee\":\"1\",\"divideFee\":0.0},{\"month\":\"6\",\"minInterest\":\"11\",\"maxInterest\":\"24\",\"mgmtFee\":\"0.3\",\"tradeFee\":\"0\",\"guaranteeFee\":\"5\",\"inRepayPenalFee\":\"1\",\"divideFee\":0.0},{\"month\":\"9\",\"minInterest\":\"12\",\"maxInterest\":\"24\",\"mgmtFee\":\"0.3\",\"tradeFee\":\"0\",\"guaranteeFee\":\"5\",\"inRepayPenalFee\":\"1\",\"divideFee\":0.0},{\"month\":\"12\",\"minInterest\":\"12\",\"maxInterest\":\"24\",\"mgmtFee\":\"0.3\",\"tradeFee\":\"0\",\"guaranteeFee\":\"5\",\"inRepayPenalFee\":\"1\",\"divideFee\":0.0},{\"month\":\"15\",\"minInterest\":\"13\",\"maxInterest\":\"24\",\"mgmtFee\":\"0.3\",\"tradeFee\":\"0\",\"guaranteeFee\":\"5\",\"inRepayPenalFee\":\"1\",\"divideFee\":0.0},{\"month\":\"18\",\"minInterest\":\"13\",\"maxInterest\":\"24\",\"mgmtFee\":\"0.3\",\"tradeFee\":\"0\",\"guaranteeFee\":\"5\",\"inRepayPenalFee\":\"1\",\"divideFee\":0.0},{\"month\":\"24\",\"minInterest\":\"13\",\"maxInterest\":\"24\",\"mgmtFee\":\"0.3\",\"tradeFee\":\"0\",\"guaranteeFee\":\"5\",\"inRepayPenalFee\":\"1\",\"divideFee\":0.0},{\"month\":\"36\",\"minInterest\":\"13\",\"maxInterest\":\"24\",\"mgmtFee\":\"0.3\",\"tradeFee\":\"0\",\"guaranteeFee\":\"5\",\"inRepayPenalFee\":\"1\",\"divideFee\":0.0}]",
"months": 24,
"nickName": "sdcsqk",
"oldLoan": false,
"openTime": "Nov 19, 2013 9:11:48 PM",
"overDued": false,
"picture": "",
"principal": 0.0,
"productId": 7,
"productName": "HR",
"repaidByGuarantor": false,
"repayType": "MONTH",
"startTime": "Dec 19, 2013 9:11:48 PM",
"status": "FAILED",
"surplusAmount": 30000.0,
"title": "\u9ad8\u4e2d\u6559\u5e08\uff0c\u5de5\u4f5c\u7a33\u5b9a\u6309\u65f6\u5f52\u8fd8!",
"utmSource": "from-website",
"verifyState": "CANCEL"
}
},
"status": 0
}</script>

在之前的版本中,应用了 re 进行简单粗暴的正则匹配,效率较低,因此在重构过程中,将使用 BS4 对这个标签进行提取,之后应用 json 库将 string 转为 dict,便于后面的调用和输出。

下面简单介绍一下应用到的方法:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Fetch a renrendai loan-detail page and pretty-print its embedded JSON.

The page embeds all credit data inside a
``<script id="credit-info-data" type="text/x-json">`` tag; we extract that
tag with BeautifulSoup and decode its body with the json module.
"""

__author__ = 'Gao Yuhao'

# Python 2 compatibility: make input() behave like raw_input().
# Catch only NameError (raw_input missing on Python 3), not everything.
try:
    input = raw_input
except NameError:
    pass

import requests
# Fixed typo: the original imported "BeartifulSoup", which raises
# ImportError — the code below always used the correct name.
from bs4 import BeautifulSoup
import json

# Ask which loan page to test against.
page_index = input('Pls input the page_index you want to try:')
surl = 'http://www.we.com/lend/detailPage.action?loanId=' + page_index

# Fetch the page. requests already decodes the body, so pass the
# unicode text straight to BeautifulSoup — no manual .encode() needed
# (encoding to bytes just forces BS to guess the charset again).
req = requests.get(url=surl)
html = req.text

# Extract the JSON payload with an explicit parser; omitting the parser
# name triggers a warning and can pick different parsers per machine.
soup = BeautifulSoup(html, 'html.parser')
res = soup('script', id='credit-info-data')[0].text

# Convert the JSON string to a dict and pretty-print it.
# print(...) with a single argument works on both Python 2 and 3,
# unlike the original "print json.dumps(...)" statement.
res_json = json.loads(res)
print(json.dumps(res_json, indent=4))

BeautifulSoup 和 json 库在爬虫项目中的应用
https://blog.yuhaogao.com/2016/08/22/BeautifulSoup-和-json-库在爬虫项目中的应用/
作者
宇皓
发布于
2016年8月22日
许可协议