"""Download a web novel from ybiquge.com using a small thread pool.

Chapter pages are fetched concurrently; each finished chapter is pushed
into a PriorityQueue keyed by its numeric chapter id so the final file
is assembled in reading order, then saved to '<book name>.txt'.
"""
import time
from multiprocessing.dummy import Pool as ThreadPool
from queue import PriorityQueue
from urllib.parse import urljoin

import requests
from lxml import etree

# Finished chapters land here as (chapter_id, text); draining the
# PriorityQueue yields them in ascending chapter-id order.
queue = PriorityQueue()

# Index page of the book to download.
start_url = 'http://www.ybiquge.com/95_95524/'

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
}


def get_url(url):
    """GET *url* and return its HTML text, or False on any request error."""
    print("访问url:{}".format(url))
    try:
        # BUG FIX: the original called requests.get(url, headers), which
        # binds the header dict to the `params` argument (query string)
        # instead of sending it as HTTP headers.
        res = requests.get(url, headers=headers)
        return res.text
    except requests.RequestException:
        # Narrowed from a bare `except:` so real bugs (KeyboardInterrupt,
        # typos) are no longer silently swallowed.
        return False


def get_page():
    """Fetch the book's index page.

    Returns (book_name, chapter_href_list) on success, or False when the
    index page could not be downloaded.
    """
    page_html = get_url(start_url)
    if not page_html:
        return False
    page_tree = etree.HTML(page_html)
    book_name = page_tree.xpath('//div[@id="info"]/h1/text()')[0]
    print(book_name)
    page_url_lists = page_tree.xpath('//div[@id="list"]/dl/dd/a/@href')
    return book_name, page_url_lists


def get_text(url):
    """Fetch one chapter and enqueue (chapter_id, formatted chapter text).

    *url* may be relative to the index page; it is resolved against
    start_url. Sleeps 1s afterwards to be polite to the server.
    """
    url = urljoin(start_url, url)
    zj_html = get_url(url)
    if not zj_html:
        return
    page_tree = etree.HTML(zj_html)
    zj_name = page_tree.xpath('//div[@class="bookname"]/h1/text()')[0]
    print(zj_name)
    zj_text_list = page_tree.xpath('//div[@id="content"]/text()')
    # join() is O(n); the original built the chapter with quadratic `+=`.
    zj_text = ''.join(text + '\n' for text in zj_text_list)
    print('内容:', zj_text[0:15], '...')
    # Chapter URLs end in '/<numeric id>.html'; the id doubles as the
    # priority so chapters come back out in order.
    zj_id = url.split('/')[-1].replace('.html', '')
    queue.put((int(zj_id), zj_name + '\n' + zj_text + '\n'))
    time.sleep(1)


def down_text(name, texts):
    """Append *texts* to '<name>.txt', UTF-8 encoded."""
    path = name + '.txt'
    with open(path, 'a', encoding='utf-8') as a:
        a.writelines(texts)


if __name__ == '__main__':
    start_time = time.time()
    page = get_page()
    # ROBUSTNESS: the original unpacked get_page()'s return value directly
    # and crashed with a TypeError when it returned False.
    if not page:
        raise SystemExit("无法获取目录页: " + start_url)
    xs_book_name, zj_lists = page
    pool = ThreadPool(4)
    pool.map(get_text, zj_lists)
    pool.close()
    pool.join()
    print("正在保存小说,请稍后...")
    # Collect pieces and join once instead of quadratic string `+=`.
    parts = []
    while not queue.empty():
        parts.append(queue.get()[1])
    down_text(xs_book_name, ''.join(parts))
    print(xs_book_name, "下载完成!")
    print("共用时:", time.time() - start_time)