前言:最近发现上次爬取的得间小说网站更新了,PC 端网页只能看前几章,而且页面内容改成了用 JavaScript 动态加载。
用手机访问时发现仍然可以免费阅读,网址变成了 m.idejian.com,但是用 PC 端访问该网址会被自动跳转。
思路:用 JavaScript 动态加载的网站,一般用 requests 是爬取不到内容的,需要使用 Selenium 爬取。Selenium 是一个自动化测试工具,利用它可以驱动浏览器执行特定的动作,如点击、下拉等操作,同时还可以获取浏览器当前呈现的页面源代码,做到可见即可爬。对于一些使用 JavaScript 动态渲染的页面来说,这种抓取方式非常有效。
同时,谷歌浏览器的 F12 开发者工具是可以模拟手机设备访问的,那么 Selenium 应该也可以,于是就去找了一下相关代码。
# Code listing: download the chapters of one book from m.idejian.com with
# Selenium, using Chrome's mobile emulation to reach the mobile site.
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait


class CrackSlider():
    """Scrape a book from m.idejian.com chapter by chapter into a text file.

    Typical use: ``open()`` -> ``book_name()`` -> ``get_book(url, 1)``.
    The output file is named ``<book title>.txt`` and is opened in append
    mode, so re-running adds to an existing file.
    """

    def __init__(self):
        """Start a headless Chrome that emulates an iPhone X."""
        super(CrackSlider, self).__init__()
        self.book_id = '12525742'
        self.zj_id = 1  # number of the first chapter to download
        self.url = self._chapter_url(self.zj_id)
        self.book_names = ''  # filled in by book_name()

        self.options = webdriver.ChromeOptions()
        # Drop the switches that mark the browser as automation-controlled.
        self.options.add_experimental_option(
            'excludeSwitches', ['enable-automation'])
        self.options.add_experimental_option('useAutomationExtension', False)
        self.options.add_argument('--headless')
        self.mobileEmulation = {'deviceName': 'iPhone X'}
        self.options.add_experimental_option(
            'mobileEmulation', self.mobileEmulation)

        self.driver = webdriver.Chrome(options=self.options)
        # Make navigator.webdriver read as undefined on every new document.
        self.driver.execute_cdp_cmd(
            'Page.addScriptToEvaluateOnNewDocument',
            {
                'source': 'Object.defineProperty(navigator, "webdriver", '
                          '{get: () => undefined})'
            },
        )
        self.driver.set_window_size(1366, 768)
        self.wait = WebDriverWait(self.driver, 10)  # 10 second timeout

    def _chapter_url(self, zj_id):
        """Return the mobile-site URL of chapter number *zj_id*."""
        return ('https://m.idejian.com/book/' + self.book_id + '/'
                + str(zj_id) + '.html')

    def _wait_for_chapter_title(self):
        """Block until a chapter title element is present.

        Raises:
            TimeoutException: if no title shows up within the wait timeout.
        """
        self.wait.until(
            EC.presence_of_all_elements_located(
                (By.CLASS_NAME, 'text-title-1')
            )
        )

    def open(self):
        """Load chapter 1 and click the ``tips_menu_it`` element.

        Raises:
            TimeoutException: if the chapter title never appears.
        """
        self.driver.get(self._chapter_url(1))
        self._wait_for_chapter_title()
        # NOTE(review): presumably this opens the overlay that shows the
        # book title read by book_name() - confirm against the live page.
        self.driver.find_element(By.CLASS_NAME, 'tips_menu_it').click()

    def book_name(self):
        """Read the book title from the page and store it in ``book_names``.

        Only the first line of the ``biaoti`` element's text is kept.
        """
        title_text = self.driver.find_element(By.CLASS_NAME, 'biaoti').text
        first_line = title_text.split('\n')[0]
        print(first_line)
        self.book_names = first_line

    def get_book(self, zj_url, zj_id):
        """Download chapters starting at *zj_url* until none is left.

        Args:
            zj_url: URL of the first chapter to fetch.
            zj_id: Chapter number that *zj_url* corresponds to; following
                chapters are fetched by incrementing this number.

        The browser is shut down via ``close()`` before returning.
        """
        # Iterate rather than recurse: one stack frame per chapter would hit
        # Python's recursion limit on long books.
        while True:
            self.driver.get(zj_url)
            try:
                self._wait_for_chapter_title()
            except TimeoutException:
                # No chapter title within the timeout: treat as end of book.
                self.close()
                return

            zj_name = self.driver.find_element(
                By.CLASS_NAME, 'text-title-1').text
            print(zj_name)

            # On chapter 1 the text lives in the second 'h5_mainbody'
            # element; on every other chapter the first one is used.
            if zj_id == 1:
                body = self.driver.find_elements(
                    By.CLASS_NAME, 'h5_mainbody')[1]
            else:
                body = self.driver.find_element(By.CLASS_NAME, 'h5_mainbody')
            paragraphs = body.find_elements(By.TAG_NAME, 'p')

            if not paragraphs:
                # A chapter page without any paragraphs is treated as the
                # end of the book too, so the loop cannot spin on empty
                # pages.
                self.close()
                return

            lines = [zj_name]
            lines.extend(p.text for p in paragraphs)
            txt_s = '\n'.join(lines) + '\n'
            print(txt_s)
            self.writer(txt_s)

            zj_id += 1
            zj_url = self._chapter_url(zj_id)

    def writer(self, txt):
        """Append *txt* plus a blank-line separator to ``<book title>.txt``."""
        path = self.book_names + '.txt'
        with open(path, 'a', encoding='utf-8') as f:
            f.write(txt)
            f.write('\n\n')

    def close(self):
        """Report completion and shut the browser down."""
        print("下载完成")
        # quit() ends the whole browser session, including the chromedriver
        # process; close() would only close the current window.
        self.driver.quit()


if __name__ == '__main__':
    c = CrackSlider()
    try:
        c.open()
        c.book_name()
        c.get_book(c.url, c.zj_id)
    except Exception:
        # Do not leave a headless Chrome running when the scrape fails.
        c.driver.quit()
        raise
嘿嘿,评论系统没开,代码写得实在看不下去的话,可以通过联系方式来找我喷…