import time
from multiprocessing.dummy import Pool as ThreadPool
from queue import PriorityQueue
from urllib.parse import urljoin

import requests
from lxml import etree
queue = PriorityQueue()
start_url = 'http://www.ybiquge.com/95_95524/' headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36' }
def get_url(url): print("访问url:{}".format(url)) try: res = requests.get(url, headers) return res.text except: return False
def get_page(): page_html = get_url(start_url) if page_html: page_tree = etree.HTML(page_html) book_name = page_tree.xpath('//div[@id="info"]/h1/text()')[0] print(book_name) page_url_lists = page_tree.xpath('//div[@id="list"]/dl/dd/a/@href') return book_name, page_url_lists else: return False
def get_text(url): url = urljoin(start_url, url) zj_html = get_url(url) if zj_html: page_tree = etree.HTML(zj_html) zj_name = page_tree.xpath('//div[@class="bookname"]/h1/text()')[0] print(zj_name) zj_text_list = page_tree.xpath('//div[@id="content"]/text()') zj_text = '' for text in zj_text_list: zj_text = zj_text + text + '\n' print('内容:', zj_text[0:15], '...') zj_id = url.split('/')[-1].replace('.html', '') queue.put((int(zj_id), zj_name+'\n'+zj_text+'\n')) time.sleep(1)
def down_text(name, texts): path = name + '.txt' with open(path, 'a', encoding='utf-8') as a: a.writelines(texts)
if __name__ == '__main__': start_time = time.time() xs_book_name, zj_lists = get_page() pool = ThreadPool(4) pool.map(get_text, zj_lists) pool.close() pool.join() print("正在保存小说,请稍后...") content = "" while not queue.empty(): next_obj = queue.get() content += next_obj[1] down_text(xs_book_name, content) print(xs_book_name, "下载完成!") print("共用时:", time.time()-start_time)