yjsx86 大佬有话说 :
清华文泉学堂协程下载
本帖最后由 yjsx86 于 2020-2-2 22:08 编辑
协程下载, 增加下载速度, 试试看
GitHub 上那个我试用了一下,会出现 not found 还是 not load 之类的错误,导致下载失败。
这个脚本会把下载失败的页面重新放回队列,最终都会下载成功(测试的时候有两张图总是下载失败,程序重试了上百次后成功了!)
import requests
import jwt
import json
import time
import aiohttp
import asyncio
import os
import logging
logging.basicConfig(level=logging.INFO)
class Wqxuetang():
    """Concurrent page-image downloader for one book on lib-nuanxin.wqxuetang.com.

    Constructing an instance performs blocking HTTP requests (JWT key and
    book metadata), then creates a directory named after the book under the
    current working directory and changes into it. Call ``main()`` to run
    the download.
    """

    def __init__(self, bookid, max_threads=4):
        """Fetch the book's key and metadata and prepare the output directory.

        Args:
            bookid: Book identifier used in every request URL.
            max_threads: Number of concurrent worker coroutines (despite the
                name, these are asyncio tasks, not OS threads).
        """
        self.bookid = bookid
        self.max_threads = max_threads
        # Page numbers waiting to be downloaded; failed pages are re-queued.
        self.work_queue = asyncio.Queue()
        # Secret used to sign the per-page HS256 token.
        self.jwt_secret = "g0NnWdSE8qEjdMD8a1aq12qEYphwErKctvfd3IktWHWiOBpVsgkecur38aBRPn2w"
        self.session = requests.session()
        self.jwtkey = self.get_jwt_key()
        self.timeoutlist = []
        (self.bookname, self.totalpages) = self.bookinfo()
        self.totalpages = int(self.totalpages)
        self.creat_and_enter_book_dir()

    def creat_and_enter_book_dir(self):
        """Create ``<cwd>/<bookname>`` if needed and make it the working directory."""
        curpath = os.getcwd()
        newpath = curpath + os.path.sep + self.bookname
        try:
            os.mkdir(newpath)
        except FileExistsError:
            # A previous run already created the folder; reuse it instead of
            # crashing so the download can be re-run.
            pass
        os.chdir(newpath)

    def bookinfo(self):
        """Return ``(name, canreadpages)`` from the book's ``initread`` response."""
        url = f"https://lib-nuanxin.wqxuetang.com/v1/read/initread?bid={self.bookid}"
        # NOTE(review): no timeout here, unlike get_jwt_key(); a stalled
        # connection blocks forever.
        r = self.session.get(url)
        info = json.loads(r.text)
        data = info['data']
        return data['name'], data['canreadpages']

    def get_jwt_key(self):
        """Return the ``data`` field of the book's key endpoint response."""
        url = "https://lib-nuanxin.wqxuetang.com/v1/read/k?bid=%s" % self.bookid
        r = self.session.get(url, timeout=5)
        j = json.loads(r.text)
        return j['data']

    def get_jwt_token(self, page):
        """Build the signed token that authorises fetching one page image.

        Args:
            page: Page number to embed in the token.

        Returns:
            The encoded token as a ``str``.
        """
        cur_time = time.time()
        jwttoken = jwt.encode({
            "p": page,
            "t": int(cur_time)*1000,
            "b": str(self.bookid),
            "w": 1000,
            "k": json.dumps(self.jwtkey),
            "iat": int(cur_time)
        }, self.jwt_secret, algorithm='HS256')
        # PyJWT 1.x returns bytes, PyJWT 2.x returns str; accept both.
        if isinstance(jwttoken, bytes):
            jwttoken = jwttoken.decode('ascii')
        return jwttoken

    async def download_img(self, page, task_id):
        """Download one page image, re-queueing the page on any failure.

        Args:
            page: Page number to fetch.
            task_id: Worker identifier, used only in log messages.
        """
        token = self.get_jwt_token(page)
        url = f"https://lib-nuanxin.wqxuetang.com/page/img/{self.bookid}/{page}?k={token}"
        headers = {
            'referer': f'https://lib-nuanxin.wqxuetang.com/read/pdf/{self.bookid}',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
        }
        # A bare number as timeout is deprecated in aiohttp; use ClientTimeout.
        timeout = aiohttp.ClientTimeout(total=5)
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, headers=headers, timeout=timeout) as response:
                    r = await response.read()
        except Exception:
            # Deliberate best-effort: any network error just sends the page
            # back to the queue so a worker retries it later.
            logging.warning(f"线程 {task_id} -> 第 {page} 张下载失败, 重回队列!!!")
            self.work_queue.put_nowait(page)
        else:
            with open(f"{self.bookname+str(page)}.png", "wb") as f:
                f.write(r)
            logging.info(f"线程 {task_id} -> 第 {page} 张下载完成")

    async def handle_tasks(self, task_id):
        """Worker loop: take pages from the queue until it is empty."""
        while not self.work_queue.empty():
            page = await self.work_queue.get()
            await self.download_img(page, task_id)
        logging.info(f"线程 {task_id} 结束工作!~")

    def main(self):
        """Queue every page, run ``max_threads`` workers to completion, close the loop."""
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            # Newer Python versions no longer create a loop implicitly.
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
        # NOTE(review): the queue-seeding statement and the right-hand side of
        # `tasks =` were lost from the pasted source. This reconstruction
        # assumes pages are numbered 1..totalpages and workers 0..max_threads-1
        # -- confirm against the original script.
        for page in range(1, self.totalpages + 1):
            self.work_queue.put_nowait(page)
        # Wrap coroutines in tasks: asyncio.wait() rejects bare coroutines on
        # Python 3.11+.
        tasks = [loop.create_task(self.handle_tasks(task_id))
                 for task_id in range(self.max_threads)]
        loop.run_until_complete(asyncio.wait(tasks))
        loop.close()
if __name__ == "__main__":
    bookid = 3187882
    # Argument 1: book id; argument 2: number of concurrent workers.
    # The book's name is used as the name of the download folder.
    w = Wqxuetang(bookid, 4)
    w.main()
haierwx21410 大佬有话说 :
顶一顶,请问楼主,这个下载下来是图片么?还是pdf啊
yjsx86 大佬有话说 :
haierwx21410 大佬有话说 : 2020-2-2 22:10
顶一顶,请问楼主,这个下载下来是图片么?还是pdf啊
下载下来的是图片,需要 PDF 的话自己合成吧
jxpal 大佬有话说 :
顶一下,顺便mark
东南西北 大佬有话说 :
马克一下
杜甫 大佬有话说 :
🙂 每年618买的书堆成山了都
vopcloud 大佬有话说 :
import requests
import jwt
import json
import time
import aiohttp
import asyncio
import os
import logging
import sys
logging.basicConfig(level=logging.INFO)
class Wqxuetang():
    """Concurrent page-image downloader for one book on lib-nuanxin.wqxuetang.com.

    Constructing an instance performs blocking HTTP requests (JWT key and
    book metadata), then creates a directory named after the book under the
    current working directory and changes into it. Call ``main()`` to run
    the download. Responses matching a known placeholder size are treated as
    failures and retried.
    """

    def __init__(self, bookid, max_threads=4):
        """Fetch the book's key and metadata and prepare the output directory.

        Args:
            bookid: Book identifier used in every request URL.
            max_threads: Number of concurrent worker coroutines (despite the
                name, these are asyncio tasks, not OS threads).
        """
        self.bookid = bookid
        self.max_threads = max_threads
        # Page numbers waiting to be downloaded; failed pages are re-queued.
        self.work_queue = asyncio.Queue()
        # Secret used to sign the per-page HS256 token.
        self.jwt_secret = "g0NnWdSE8qEjdMD8a1aq12qEYphwErKctvfd3IktWHWiOBpVsgkecur38aBRPn2w"
        self.session = requests.session()
        self.jwtkey = self.get_jwt_key()
        self.timeoutlist = []
        (self.bookname, self.totalpages) = self.bookinfo()
        self.totalpages = int(self.totalpages)
        self.creat_and_enter_book_dir()

    def creat_and_enter_book_dir(self):
        """Create ``<cwd>/<bookname>`` if needed and make it the working directory."""
        curpath = os.getcwd()
        newpath = curpath + os.path.sep + self.bookname
        try:
            os.mkdir(newpath)
        except FileExistsError:
            # A previous run already created the folder; reuse it instead of
            # crashing so the download can be re-run.
            pass
        os.chdir(newpath)

    def bookinfo(self):
        """Return ``(name, canreadpages)`` from the book's ``initread`` response."""
        url = f"https://lib-nuanxin.wqxuetang.com/v1/read/initread?bid={self.bookid}"
        # NOTE(review): no timeout here, unlike get_jwt_key(); a stalled
        # connection blocks forever.
        r = self.session.get(url)
        info = json.loads(r.text)
        data = info['data']
        return data['name'], data['canreadpages']

    def get_jwt_key(self):
        """Return the ``data`` field of the book's key endpoint response."""
        url = "https://lib-nuanxin.wqxuetang.com/v1/read/k?bid=%s" % self.bookid
        r = self.session.get(url, timeout=5)
        j = json.loads(r.text)
        return j['data']

    def get_jwt_token(self, page):
        """Build the signed token that authorises fetching one page image.

        Args:
            page: Page number to embed in the token.

        Returns:
            The encoded token as a ``str``.
        """
        cur_time = time.time()
        jwttoken = jwt.encode({
            "p": page,
            "t": int(cur_time)*1000,
            "b": str(self.bookid),
            "w": 1000,
            "k": json.dumps(self.jwtkey),
            "iat": int(cur_time)
        }, self.jwt_secret, algorithm='HS256')
        # PyJWT 1.x returns bytes, PyJWT 2.x returns str; accept both.
        if isinstance(jwttoken, bytes):
            jwttoken = jwttoken.decode('ascii')
        return jwttoken

    async def download_img(self, page, task_id):
        """Download one page image, re-queueing the page on any failure.

        Args:
            page: Page number to fetch.
            task_id: Worker identifier, used only in log messages.
        """
        token = self.get_jwt_token(page)
        url = f"https://lib-nuanxin.wqxuetang.com/page/img/{self.bookid}/{page}?k={token}"
        headers = {
            'referer': f'https://lib-nuanxin.wqxuetang.com/read/pdf/{self.bookid}',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
        }
        # A bare number as timeout is deprecated in aiohttp; use ClientTimeout.
        timeout = aiohttp.ClientTimeout(total=5)
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url, headers=headers, timeout=timeout) as response:
                    r = await response.read()
        except Exception:
            # Deliberate best-effort: any network error just sends the page
            # back to the queue so a worker retries it later.
            logging.warning(f"线程 {task_id} -> 第 {page} 张下载失败, 重回队列!!!")
            self.work_queue.put_nowait(page)
        else:
            # Heuristic from the poster: broken downloads show a "loading"
            # image and all have this same size, so treat a match as a
            # failure. NOTE(review): sys.getsizeof includes the bytes
            # object's interpreter-dependent header, so this constant is
            # tied to the build it was measured on -- confirm.
            if sys.getsizeof(r)==10433:
                self.work_queue.put_nowait(page)
                logging.warning(f"线程 {task_id} -> 第 {page} 张加载失败, 重回队列!!!")
            else:
                with open(f"{self.bookname+str(page)}.png", "wb") as f:
                    f.write(r)
                logging.info(f"线程 {task_id} -> 第 {page} 张下载完成")

    async def handle_tasks(self, task_id):
        """Worker loop: take pages from the queue until it is empty."""
        while not self.work_queue.empty():
            page = await self.work_queue.get()
            await self.download_img(page, task_id)
        logging.info(f"线程 {task_id} 结束工作!~")

    def main(self):
        """Queue every page, run ``max_threads`` workers to completion, close the loop."""
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            # Newer Python versions no longer create a loop implicitly.
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
        # NOTE(review): the queue-seeding statement and the right-hand side of
        # `tasks =` were lost from the pasted source. This reconstruction
        # assumes pages are numbered 1..totalpages and workers 0..max_threads-1
        # -- confirm against the original script.
        for page in range(1, self.totalpages + 1):
            self.work_queue.put_nowait(page)
        # Wrap coroutines in tasks: asyncio.wait() rejects bare coroutines on
        # Python 3.11+.
        tasks = [loop.create_task(self.handle_tasks(task_id))
                 for task_id in range(self.max_threads)]
        loop.run_until_complete(asyncio.wait(tasks))
        loop.close()
if __name__ == "__main__":
    # The book id comes from the first command-line argument.
    # NOTE(review): the pasted source read `int(sys.argv)`, which raises
    # TypeError on a list; the `[1]` index was presumably stripped -- confirm.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} BOOKID")
    bookid = int(sys.argv[1])
    # Argument 1: book id; argument 2: number of concurrent workers.
    # The book's name is used as the name of the download folder.
    w = Wqxuetang(bookid, 4)
    w.main()
楼主,我发现好像有些下载下来的图片有问题,图片都显示加载中,但是这些下载异常的图片大小都一样,就简单粗暴加了个判断,不知楼主有没有啥好方法?
yjsx86 大佬有话说 :
vopcloud 大佬有话说 : 2020-2-2 22:46
楼主,我发现好像有些下载下来的图片有问题,图片都显示加载中,但是这些下载异常的图片大小都一样,就简 …
给个bookid我试试