python多线程下载小说

前言

作为一个宅男，自然是一定会看小说的。现在好看的小说大多需要付费，但是白嫖的一定更香。

分享一个我经常看的小说网站得间，基本掌阅的小说都有。

自己一般是在 APP 里听小说，但浏览器里听不了，于是就把小说爬下来在本地阅读。

思路

一开始只是用 requests 库逐章下载，发现下载速度实在是太慢了，就加上了多线程。

感谢桑梓南大佬提供的多线程方法

代码实现

import codecs
import os
import threading
import time
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup

# Start time: wall-clock timestamp (seconds since the epoch) taken as the
# script begins executing.
start_time = time.time()


# Worker thread class
class myTread(threading.Thread):
    """Thread that hands one starting index to ``threadget``.

    threadID: identifier stored on the instance.
    name: thread name; also printed in the start/end messages.
    st: value passed to ``threadget`` when the thread runs.
    """

    def __init__(self, threadID, name, st):
        super().__init__()
        self.threadID = threadID
        self.name = name
        self.st = st

    def run(self):
        # Announce start, do the work, announce completion.
        label = str(self.name)
        print("stat ", label)
        threadget(self.st)
        print("end ", label)


# Holds the downloaded text of the whole novel
txtcontent = {}

# Base address of the site
server_url = 'https://www.idejian.com'

# Headers sent with every request; the long values are split across lines
# purely for readability (adjacent literals are concatenated).
headers = {
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.9'
    ),
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'user-agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/87.0.4280.141 Safari/537.36'
    ),
}


# HTTP GET helper
def gethtml(url, timeout=30):
    """Fetch *url* with the shared request headers and return the page text.

    url: address to request.
    timeout: seconds requests waits on the server before giving up; without
        a limit a stalled connection would block the calling thread forever.

    Returns the response body decoded as UTF-8, with undecodable bytes
    dropped. Network failures (including a timeout) propagate as
    requests.RequestException.
    """
    response = requests.get(url=url, headers=headers, timeout=timeout)
    return response.content.decode('UTF-8', 'ignore')


txt_name = []  # novel title(s); getchapter appends the title it finds
chaptername = []  # chapter titles, in the order getchapter reads them
chapteraddress = []  # chapter URLs, appended alongside chaptername entries


# Collect the novel title plus every chapter's title and address
def getchapter(html):
    """Parse a book detail page and fill the module-level chapter lists.

    html: markup of the book's detail page.

    On success the title is appended to ``txt_name`` and every catalogue link
    contributes one entry each to ``chaptername`` and ``chapteraddress``;
    returns True. When the expected elements are missing, or the catalogue
    holds no links, prints a notice and returns False without touching the
    lists.
    """
    soup = BeautifulSoup(html, 'lxml')
    try:
        title_link = soup.find('div', class_="detail_bkname").find('a')
        links = soup.find('ul', class_="catelog_list").find_all('a')
        # Build everything before publishing so a failure part-way through
        # cannot leave the shared lists half filled.
        # get_text() is used because .string is None for tags with nested markup.
        title = title_link.get_text()
        names = [link.get_text() for link in links]
        addresses = [server_url + link['href'] for link in links]
    except (AttributeError, KeyError):
        # A missing element surfaces as attribute access on None; a link
        # without href surfaces as KeyError.
        print('未找到章节')
        return False
    if not links:
        print('未找到章节')
        return False
    txt_name.append(title)
    chaptername.extend(names)
    chapteraddress.extend(addresses)
    return True


# Extract the text of one chapter
def getdetail(html):
    """Return the chapter text found in a chapter page.

    html: markup of a single chapter page.

    The ``<p>`` texts inside the ``h5_mainbody`` container are concatenated
    without separators. When the page holds more than one such container the
    second one is used, otherwise the first.

    Returns the joined text, or the marker string "出错" (after printing it)
    when the page has no ``h5_mainbody`` container.
    """
    soup = BeautifulSoup(html, 'lxml')
    bodies = soup.find_all('div', class_="h5_mainbody")
    if not bodies:
        print("出错")
        return "出错"
    body = bodies[1] if len(bodies) > 1 else bodies[0]
    # get_text() yields '' for an empty paragraph and the flattened text for
    # one with nested tags; .string would be None in both cases.
    return ''.join(p.get_text() for p in body.find_all('p'))


# Worker routine executed by each download thread
def threadget(st):
    """Download every ``thread_count``-th chapter, starting at index *st*.

    st: index of the first chapter this worker handles.

    Each chapter's text is stored in ``txtcontent`` under its chapter index.
    A chapter whose request fails is recorded as "出错", so one bad request
    neither stops the worker nor leaves a gap in the results.
    """
    for index in range(st, len(chaptername), thread_count):
        try:
            html = gethtml(str(chapteraddress[index]))
        except requests.RequestException as err:
            print('下载失败' + str(chaptername[index]), err)
            txtcontent[index] = "出错"
            continue
        txtcontent[index] = getdetail(html)
        print('下载完毕' + str(chaptername[index]))


def getname(name):
    """Search the site for *name* and return the first result's link.

    name: novel title typed by the user.

    Returns the ``href`` of the first search hit, or False (after printing a
    notice) when the result list or its link cannot be found.
    """
    # Percent-encode the keyword so characters such as '&' or '#' stay inside
    # the query value instead of altering the URL.
    url = server_url + '/search?keyword=' + quote(name)
    soup = BeautifulSoup(gethtml(url), 'lxml')
    try:
        first_hit = soup.find('ul', class_="rank_ullist").find('div', class_="rank_bkname").find('a')
        href = first_hit['href']
    except (AttributeError, KeyError, TypeError):
        # Missing elements show up as attribute/subscript access on None; a
        # link without href shows up as KeyError.
        print('未找到该小说')
        return False
    print('开始下载小说：', first_hit.get_text())
    return href





# Script body: look the novel up, download all chapters with worker threads,
# then write them to ./小说/<title>.txt in chapter order.
txt_id = input('请输入你想要下载的小说名字\n')
book_url = getname(txt_id)
if book_url:
    url = server_url + book_url
    html = gethtml(url)
    getchapter(html)

    # getchapter prints its own notice on failure; continue only when a title
    # and at least one chapter were actually collected.
    if txt_name and chaptername:
        # At least one worker is required, and workers beyond the number of
        # chapters would have nothing to do.
        thread_count = int(input("请输入需要开的线程数\n"))
        thread_count = max(1, min(thread_count, len(chaptername)))

        # Restart the clock here so the reported duration covers the download
        # and write-out only, not the time spent waiting at the prompts.
        start_time = time.time()

        # Worker i starts at chapter i; threadget strides by thread_count.
        thread_list = [myTread(i, str(i), i) for i in range(thread_count)]
        for t in thread_list:
            t.start()
        for t in thread_list:
            t.join()
        print('\n子线程运行完毕')

        # The output folder may not exist on a first run.
        os.makedirs('./小说', exist_ok=True)
        out_path = './小说/' + str(txt_name[0]) + '.txt'  # where the novel is stored

        # Write chapters in catalogue order; the context manager closes the
        # file even if a write fails.
        with codecs.open(out_path, 'w', encoding='utf-8') as file:
            for ch, chapter_title in enumerate(chaptername):
                # A chapter no worker stored is written as the "出错" marker.
                content = txtcontent.get(ch, '出错')
                # Trailing newline keeps the next title on its own line.
                file.write(str(chapter_title) + '\n' + str(content) + '\n')
        end_time = time.time()
        print("下载完毕，总耗时", int(end_time - start_time), "秒")

python3 小说

本博客所有文章除特别声明外，均采用 CC BY-SA 4.0 协议，转载请注明出处！

用Python试玩mysql 上一篇

PC版微信加密图片解密，用Python实现 下一篇