可见即可爬,selenium 爬取js手机网站

前言

最近发现上次爬的得间小说网站突然更新了,PC端网页只能看前几章,而且网站还用js加载了。

手机访问发现还可以免费观看,网址变成m.idejian.com,但是PC端访问会自动跳转

思路

JS 动态加载的网站一般用 requests 是爬取不到的,这时就要使用 Selenium 爬取。Selenium 是一个自动化测试工具,利用它可以驱动浏览器执行特定的动作,如点击、下拉等操作,同时还可以获取浏览器当前呈现的页面源代码,做到可见即可爬。对于一些使用 JavaScript 动态渲染的页面来说,此种抓取方式非常有效。

同时谷歌浏览器的 F12 开发者工具是可以模拟手机设备访问的,这样 Selenium 应该也可以,于是就去找了一下代码。

代码

完整代码如下:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


class CrackSlider():
    """Download a novel from m.idejian.com chapter by chapter.

    A headless Chrome instance emulating an iPhone X is driven through
    Selenium. Each chapter page is loaded, its title and paragraphs are
    extracted, and the text is appended to ``<book name>.txt`` in the
    current working directory.

    Typical use::

        c = CrackSlider()
        c.open()
        c.book_name()
        c.get_book(c.url, c.zj_id)
    """

    def __init__(self):
        """Configure Chrome and start the browser session."""
        super(CrackSlider, self).__init__()
        # Book ID on the site
        self.book_id = '12525742'
        # Chapter number to start from
        self.zj_id = 1
        # URL of the first chapter to download
        self.url = self._chapter_url(self.zj_id)
        # Book title; filled in by book_name()
        self.book_names = ''
        # Important: drop the automation switch/extension so that sites are
        # less likely to recognise the browser as Selenium-driven.
        self.options = webdriver.ChromeOptions()
        self.options.add_experimental_option('excludeSwitches', ['enable-automation'])
        self.options.add_experimental_option('useAutomationExtension', False)
        self.options.add_argument('--headless')  # run without a visible window
        # Emulate a mobile device
        self.mobileEmulation = {'deviceName': 'iPhone X'}
        self.options.add_experimental_option('mobileEmulation', self.mobileEmulation)

        self.driver = webdriver.Chrome(options=self.options)
        # Make navigator.webdriver report undefined on every new document.
        self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
            'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'
        })
        self.driver.set_window_size(1366, 768)
        # Explicit wait used for page loads, at most 10 s
        self.wait = WebDriverWait(self.driver, 10)

    def _chapter_url(self, zj_id):
        """Return the URL of chapter number ``zj_id`` of the configured book."""
        return 'https://m.idejian.com/book/' + self.book_id + '/' + str(zj_id) + '.html'

    def _wait_for_chapter(self):
        """Block until the chapter title element exists; raise TimeoutException otherwise."""
        self.wait.until(
            EC.presence_of_all_elements_located(
                (By.CLASS_NAME, 'text-title-1')
            )
        )

    def open(self):
        """Load chapter 1 and click the ``tips_menu_it`` element.

        Raises:
            TimeoutException: if the chapter title does not appear in time.
        """
        start_url = self._chapter_url(1)
        self.driver.get(start_url)
        # Wait for the page to finish rendering
        self._wait_for_chapter()
        # find_element(By...) works on both Selenium 3 and 4; the
        # find_element_by_* helpers were removed in Selenium 4.
        self.driver.find_element(By.CLASS_NAME, 'tips_menu_it').click()

    # Fetch the book title
    def book_name(self):
        """Read the book title from the ``biaoti`` element into ``self.book_names``.

        Only the first line of the element's text is kept.
        """
        title_lines = self.driver.find_element(By.CLASS_NAME, 'biaoti').text.split('\n')
        print(title_lines[0])
        self.book_names = title_lines[0]

    def get_book(self, zj_url, zj_id):
        """Download chapters starting at ``zj_url`` until no chapter loads.

        Args:
            zj_url: URL of the first chapter to fetch.
            zj_id: Chapter number matching ``zj_url``; it is incremented to
                build each following chapter URL.

        Chapters are processed in a loop rather than by recursion so that
        long books cannot exhaust the interpreter's recursion limit. When
        the chapter title does not appear before the wait times out, the
        browser is shut down via ``close()`` and the method returns.
        """
        while True:
            self.driver.get(zj_url)
            # Wait for the page to finish rendering
            try:
                self._wait_for_chapter()
            except TimeoutException:
                # Per the original author's note, pages past the last chapter
                # are blank, so a timeout marks the end of the book. Return
                # right away: the driver must not be used after close().
                self.close()
                return

            # Chapter title
            zj_name = self.driver.find_element(By.CLASS_NAME, 'text-title-1').text
            print(zj_name)

            # Chapter text: on chapter 1 the second 'h5_mainbody' element is
            # used, on every other chapter the first one.
            if zj_id == 1:
                body = self.driver.find_elements(By.CLASS_NAME, 'h5_mainbody')[1]
            else:
                body = self.driver.find_element(By.CLASS_NAME, 'h5_mainbody')
            paragraphs = body.find_elements(By.TAG_NAME, 'p')

            # Title followed by one paragraph per line, each newline-terminated
            txt_s = '\n'.join([zj_name] + [p.text for p in paragraphs]) + '\n'
            print(txt_s)
            # Save
            self.writer(txt_s)

            # Next chapter
            zj_id = zj_id + 1
            zj_url = self._chapter_url(zj_id)

    # Save
    def writer(self, txt):
        """Append ``txt`` plus two newlines to ``<book name>.txt`` (UTF-8).

        Falls back to the book ID as file name when no title has been
        fetched yet, instead of writing to a file literally named ``.txt``.
        """
        path = (self.book_names or self.book_id) + '.txt'
        with open(path, 'a', encoding='utf-8') as f:
            f.write(txt)
            f.write('\n\n')

    def close(self):
        """Report completion and shut the browser down.

        ``quit()`` is used instead of ``close()`` so that the whole session,
        not only the current window, is ended.
        """
        print("下载完成")
        self.driver.quit()


if __name__ == '__main__':
    # Script entry point: start the browser, open the first chapter, read
    # the book title, then download chapters from the configured start.
    crawler = CrackSlider()
    crawler.open()
    crawler.book_name()
    first_url, first_chapter = crawler.url, crawler.zj_id
    crawler.get_book(first_url, first_chapter)

嘿嘿

评论系统没有开放,代码写得实在看不下去的话,可以通过联系方式来找我喷…


本博客所有文章除特别声明外,均采用 CC BY-SA 4.0 协议 ,转载请注明出处!