#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File  : asyncSnifferPro.py
# Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
# Date  : 2024/3/28
# desc  : a simple media-URL sniffer built on playwright; async and high-performance
from playwright.async_api import async_playwright, Playwright
import re
import os
from urllib.parse import urlparse
from time import time
import asyncio

_description = r"""
pip install playwright
Install Google Chrome manually; `playwright install` is unnecessary, because the three browsers it bundles are too poor to be usable here.
Official API reference:
https://playwright.dev/python/docs/intro
https://playwright.dev/python/docs/api/class-playwright
"""
# Cache of launched drivers, for API-level caching
browser_drivers = []


# All values are in milliseconds; no conversion needed
class Sniffer:
    # Regexes for sniffing media requests
    urlRegex: str = 'http((?!http).){12,}?\\.(m3u8|mp4|flv|avi|mkv|rm|wmv|mpg|m4a|mp3)\\?.*|http((?!http).){12,}\\.(m3u8|mp4|flv|avi|mkv|rm|wmv|mpg|m4a|mp3)|http((?!http).)*?video/tos*'
    urlNoHead: str = 'http((?!http).){12,}?(ac=dm&url=)'
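    # Illustrative examples (hypothetical URLs): urlRegex matches direct media requests such as
    #   https://example.com/20240328/abc/index.m3u8 or https://example.com/video/tos/xyz,
    # while urlNoHead marks links (e.g. ...ac=dm&url=...) that must never be HEAD-probed.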
    # Interval between sniffs, in milliseconds
    playwright = None
    browser = None
    main_page = None
    context = None
    requests = None
    user_agent: str = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
    pages = []
    web_timeout = 15000  # fetching page source takes at most 15 s
    sniffer_timeout = 20000  # sniffing a page takes at most 20 s
    wait_timeout = 3000  # waits take at most 3 s

    def __init__(self,
                 timeout=10000, head_timeout=200, user_agent=None,
                 custom_regex=None, headless=True, debug=False, use_chrome=True, is_pc=False):
        """
        Initialize the sniffer.
        @param timeout: global sniffing timeout
        @param head_timeout: timeout for HEAD probes
        @param user_agent: default user agent
        @param custom_regex: custom sniffing regex
        @param headless: headless mode
        @param debug: enable debug logging
        @param use_chrome: use the manually installed Chrome via the "chrome" channel
        @param is_pc: emulate a desktop browser instead of a phone
        """
        if user_agent is not None:
            self.user_agent = user_agent
        self.timeout = timeout
        self.head_timeout = head_timeout
        self.debug = debug
        self.custom_regex = custom_regex
        self.headless = headless
        self.channel = "chrome" if use_chrome else None
        self.is_pc = is_pc

    def log(self, *args):
        """
        Debug print. Enabled only when the instance was created with debug=True.
        @param args:
        @return:
        """
        if self.debug:
            print(*args)

    async def __aenter__(self):
        # Called when entering the async context manager
        print('Entering the async context manager')
        self.browser = await self.init_browser()
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        # Cleanup when the context manager exits (actual teardown happens in close())
        print('Exiting the async context manager')
        pass

    async def init_browser(self):
        """
        Initialize the driver.
        @return:
        """
        self.playwright = await async_playwright().start()
        # Use the manually installed Chrome rather than the three poor bundled browsers
        browser = await self.playwright.chromium.launch(channel=self.channel, headless=self.headless)
        # print(self.playwright.devices)
        if not self.is_pc:
            # Emulate an iPhone
            iphone = self.playwright.devices["iPhone 14 Pro"]
            context = await browser.new_context(**iphone)
        else:
            context = await browser.new_context()
        # Keep one main page open so that later opening and closing pages never exits the program
        self.main_page = await context.new_page()
        # The context's built-in request client; similar to the requests library, but the API differs
        self.requests = context.request
        # return browser
        return context

    async def setCookie(self, page, cookie=''):
        """
        Set a cookie. Can be called before sniffing or before fetching page source.
        @param page: page to apply the cookie to
        @param cookie: cookie string
        @return:
        """
        await page.set_extra_http_headers(headers={'Cookie': cookie})
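
    # Usage example (hypothetical cookie values): await sniffer.setCookie(page, 'uid=1; token=abc')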

    @staticmethod
    async def _route_interceptor(route):
        """
        Global route interceptor that blocks certain resource types from loading.
        @param route:
        @return:
        """
        excluded_resource_types = ["stylesheet", "image", "font"]
        resource_type = route.request.resource_type
        # print(resource_type)
        if resource_type in excluded_resource_types:
            # print('blocked resource:', excluded_resource_types, route.request.url, route.request.resource_type)
            await route.abort()
        else:
            await route.continue_()

    @staticmethod
    async def _on_dialog(dialog):
        """
        Global dialog handler.
        @param dialog:
        @return:
        """
        # print('on_dialog:', dialog)
        await dialog.accept()

    @staticmethod
    async def _on_pageerror(error):
        """
        Global page-error handler.
        @param error:
        @return:
        """
        # print('on_pageerror:', error)
        pass

    async def _get_page(self, headers=None):
        """
        Create a new page with the required dependencies injected.
        @param headers:
        @return:
        """
        page = await self.browser.new_page()
        # Set the global navigation timeout
        page.set_default_navigation_timeout(self.timeout)
        # Set the global wait timeout
        page.set_default_timeout(self.timeout)
        # Add init scripts: speed things up and get past checks that would block playback
        await page.add_init_script(path=os.path.join(os.path.dirname(__file__), './stealth.min.js'))
        await page.add_init_script(path=os.path.join(os.path.dirname(__file__), './devtools.js'))
        # Block devtools-detector scripts https://cdn.staticfile.net/devtools-detector/2.0.14/devtools-detector.min.js
        await page.route(re.compile(r"devtools-detector.*\.js$"), lambda route: route.abort())
        # Set request headers
        if headers is not None:
            await page.set_extra_http_headers(headers=headers)
        else:
            await page.set_extra_http_headers(headers={'user-agent': self.user_agent})
        # Enable the static-resource interceptor
        await page.route(re.compile(r"\.(png|jpg|jpeg|css|ttf)$"), self._route_interceptor)
        # await page.route(re.compile(r"\.(png|jpg|jpeg|ttf)$"), self._route_interceptor)
        # Enable the dialog handler
        page.on("dialog", self._on_dialog)
        # Enable the page-error listener
        page.on("pageerror", self._on_pageerror)
        # page.set_viewport_size({"width": 360, "height": 540})
        # Track the page in the page list
        self.pages.append(page)
        return page

    @staticmethod
    def remove_element(array, element):
        """
        Remove the given element from a list.
        @param array:
        @param element:
        @return:
        """
        new_array = [x for x in array if x != element]
        return new_array

    async def close_page(self, page):
        """
        Remove the given page from the page list and close it.
        @param page:
        @return:
        """
        self.pages = self.remove_element(self.pages, page)
        await page.close()
        self.log('page closed successfully')

    async def close(self):
        """
        Remember to close the driver when you are done.
        @return:
        """
        await self.main_page.close()
        await self.browser.close()
        await self.playwright.stop()

    async def fetCodeByWebView(self, url, headers=None, timeout=None, is_pc=False, css=None, script=None):
        """
        Use a webview to fetch the fully rendered page source.
        @param url: URL of the page whose source is wanted
        @param headers: custom browser headers for the visit
        @param timeout: visit timeout in milliseconds
        @param is_pc: use a desktop browser. Currently has no effect; mobile emulation is the default
        @param css: selector to wait for | if css is not given, wait for the page 'load' state instead
        @param script: script to run after the page has loaded, e.g. to click page elements
        @return:
        """
        t1 = time()
        if timeout is None:
            timeout = self.timeout
        else:
            timeout = min([timeout, self.web_timeout])
        if not is_pc:
            is_pc = self.is_pc
        do_css = str(css).strip() if css and str(css).strip() else False
        page = await self._get_page(headers)
        # Set the global navigation timeout
        page.set_default_navigation_timeout(timeout)
        # Set the global wait timeout
        page.set_default_timeout(timeout)
        response = {'content': '', 'headers': {'location': url}}
        try:
            await page.goto(url)
        except Exception as e:
            self.log(f'An error occurred: {e}')
        else:
            if do_css:
                await page.wait_for_selector(do_css)
            else:
                await page.wait_for_load_state('load')
            if script:
                try:
                    await page.evaluate("""(script) => {
                        eval(script);
                    }
                    """, script)
                    self.log(f'Script ran successfully after page load: {script}')
                except Exception as e:
                    self.log(f'Script {script} raised an error after page load: {e}')
            response['content'] = await page.content()
            response['headers']['location'] = page.url
        t2 = time()
        cost = round((t2 - t1) * 1000, 2)
        response['cost'] = cost
        await self.close_page(page)
        return response
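
    # Minimal usage sketch (the URL is hypothetical), assuming an initialized Sniffer instance:
    #   res = await sniffer.fetCodeByWebView('https://example.com/page.html', css='.player')
    #   print(res['headers']['location'], res['cost'], res['content'][:200])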

    async def snifferMediaUrl(self, playUrl, mode=0, custom_regex=None, timeout=None, css=None, is_pc=False,
                              headers=None,
                              script=None):
        """
        Take a play-page URL and return the real video URL(s) sniffed from it.
        @param playUrl: play-page URL to sniff
        @param mode: 0: return as soon as one URL is found; 1: collect every URL found within the timeout and return a list
        @param custom_regex: custom sniffing regex
        @param timeout: timeout in milliseconds
        @param css: selector to wait for | if css is not given but script is, wait for the page 'load' state instead
        @param is_pc: use a desktop browser. Currently has no effect; mobile emulation is the default
        @param headers: custom browser headers for the visit
        @param script: script to run after the page has loaded, e.g. to click page elements
        @return:
        """
        t1 = time()
        if custom_regex is None:
            custom_regex = self.custom_regex
        if not is_pc:
            is_pc = self.is_pc
        do_css = str(css).strip() if css and str(css).strip() else False
        realUrls = []  # real URLs found so far, used when mode=1
        headUrls = []  # URLs that have already been HEAD-probed
        page = await self._get_page(headers)
        if timeout is None:
            timeout = self.timeout
        else:
            if mode == 1:  # when sniffing everything, cap the timeout at the global default
                timeout = min([timeout, self.timeout])
            else:
                # a single custom timeout must not exceed 20 s
                timeout = min([timeout, self.sniffer_timeout])

        async def _on_request(request):
            nonlocal realUrls, headUrls
            url = request.url
            method = request.method
            headers = request.headers
            resource_type = request.resource_type
            self.log('on_request:', url, ' method:', method, ' resource_type:', resource_type)
            if custom_regex and re.search(custom_regex, url, re.M | re.I):
                _headers = {}
                if headers.get('referer'):
                    _headers['referer'] = headers['referer']
                if headers.get('user-agent'):
                    _headers['user-agent'] = headers['user-agent']
                realUrls.append({
                    'url': url,
                    'headers': _headers
                })
                await page.evaluate("""([url, _headers, realUrls]) => {
                    window.realUrl = url
                    window.realHeaders = _headers
                    window.realUrls = realUrls
                }
                """, [url, _headers, realUrls])
                self.log('on_request found the real URL via custom_regex:', url)
                if mode == 0:
                    page.remove_listener("request", _on_request)
                return True
            if re.search(self.urlRegex, url, re.M | re.I):
                if url.find('url=http') < 0 and url.find('v=http') < 0 and url.find('.css') < 0 and url.find(
                        '.html') < 0:
                    _headers = {}
                    if headers.get('referer'):
                        _headers['referer'] = headers['referer']
                    if headers.get('user-agent'):
                        _headers['user-agent'] = headers['user-agent']
                    realUrls.append({
                        'url': url,
                        'headers': _headers
                    })
                    await page.evaluate("""([url, _headers, realUrls]) => {
                        window.realUrl = url
                        window.realHeaders = _headers
                        window.realUrls = realUrls
                    }
                    """, [url, _headers, realUrls])
                    self.log('on_request found the real URL via the default regex:', url)
                    if mode == 0:
                        page.remove_listener("request", _on_request)
                    return True
            elif str(method).lower() == 'get' and str(url).startswith('http') and url != playUrl:
                parsed_url = urlparse(url)
                path = parsed_url.path
                filename = str(path.split('/')[-1])
                # The URL either has no '.' and does not match the no-HEAD regex, or it has a '.'
                # with nothing after it (an empty extension also counts); scripts are skipped
                if (((filename and '.' not in filename and not re.search(self.urlNoHead, url, re.M | re.I)) or
                     ('.' in filename and len(filename) > 1 and not filename.split('.')[1]))
                        and resource_type not in ['script']):
                    # Only HEAD-probe a URL once, to avoid duplicate HEAD requests across repeated sniffs
                    if url not in headUrls:
                        try:
                            r = await self.requests.head(url=url, timeout=self.head_timeout)
                            rheaders = r.headers
                            if rheaders.get('content-type') == 'application/octet-stream' and \
                                    '.m3u8' in rheaders.get('content-disposition', ''):
                                _headers = {}
                                if headers.get('referer'):
                                    _headers['referer'] = headers['referer']
                                if headers.get('user-agent'):
                                    _headers['user-agent'] = headers['user-agent']
                                realUrls.append({
                                    'url': url,
                                    'headers': _headers
                                })
                                await page.evaluate("""([url, _headers, realUrls]) => {
                                    window.realUrl = url
                                    window.realHeaders = _headers
                                    window.realUrls = realUrls
                                }
                                """, [url, _headers, realUrls])
                                self.log('on_request found the real URL via a HEAD probe:', url)
                                if mode == 0:
                                    page.remove_listener("request", _on_request)
                                return True
                        except Exception as e:
                            print(f'HEAD request to {url} raised an error: {e}')
                        headUrls.append(url)

        page.on('request', _on_request)
        # Set the global navigation timeout
        page.set_default_navigation_timeout(timeout)
        # Set the global wait timeout
        page.set_default_timeout(timeout)
        await page.expose_function("log", lambda *args: print(*args))
        await page.add_init_script(path=os.path.join(os.path.dirname(__file__), './preload.js'))
        await page.evaluate("""
        window.realUrl = ''
        window.realHeaders = {}
        window.realUrls = []
        """)
        try:
            await page.goto(playUrl)
        except Exception as e:
            self.log('An error occurred while sniffing:', e)
            t2 = time()
            cost = round((t2 - t1) * 1000, 2)
            return {'url': '', 'headers': {}, 'from': playUrl, 'cost': cost, 'code': 404,
                    'msg': f'sniffing failed: {e}'}
        # No extra branch waiting for the 'load' state here: sniffing does not need the page to finish
        # loading, since the request listener fires asynchronously. css is rarely passed in anyway.
        if do_css:
            await page.wait_for_selector(do_css)
        if script:
            if not do_css:
                await page.wait_for_load_state('load')
            try:
                await page.evaluate("""(script) => {
                    eval(script);
                }
                """, script)
                self.log(f'Script ran successfully after page load: {script}')
            except Exception as e:
                self.log(f'Script {script} raised an error after page load: {e}')
        is_timeout = False
        if mode == 0:
            try:
                await page.wait_for_function("() => window.realUrl")
            except:
                is_timeout = True
        elif mode == 1:
            try:
                await page.wait_for_timeout(timeout)
            except:
                is_timeout = True
        realUrl = await page.evaluate('window.realUrl')
        realHeaders = await page.evaluate('window.realHeaders')
        realUrls = await page.evaluate('window.realUrls')
        t2 = time()
        cost = round((t2 - t1) * 1000, 2)
        cost_str = f'{cost} ms'
        self.log(f'Total time: {cost} ms | {"timed out" if is_timeout else "not timed out"}')
        self.log('realUrl:', realUrl)
        self.log('realHeaders:', realHeaders)
        await self.close_page(page)
        if mode == 0 and realUrl:
            return {'url': realUrl, 'headers': realHeaders, 'from': playUrl, 'cost': cost_str, 'code': 200,
                    'msg': 'super sniffer succeeded'}
        elif mode == 1 and realUrls:
            return {'urls': realUrls, 'code': 200, 'from': playUrl, 'cost': cost_str, 'msg': 'super sniffer succeeded'}
        else:
            return {'url': realUrl, 'headers': realHeaders, 'from': playUrl, 'cost': cost_str, 'code': 404,
                    'msg': 'super sniffer failed'}


async def main_test():
    t1 = time()
    urls = [
        # 'https://www.cs1369.com/play/2-1-94.html',
        # 'https://m.ting13.cc/play/19176_1_91258.html',
        'https://v.qq.com/x/page/i3038urj2mt.html',
        'http://www.mgtv.com/v/1/290346/f/3664551.html',
        'https://jx.jsonplayer.com/player/?url=https://m.iqiyi.com/v_1pj3ayb1n70.html',
        'https://jx.yangtu.top/?url=https://m.iqiyi.com/v_1pj3ayb1n70.html',
    ]
    _count = 0
    async with Sniffer(debug=True, headless=True) as browser:
        # At this point the async init has already run to completion
        pass
    for url in urls:
        _count += 1
        ret = await browser.snifferMediaUrl(url, timeout=15000)
        print(ret)
    _count += 1
    ret = await browser.snifferMediaUrl('https://jx.yangtu.top/?url=https://m.iqiyi.com/v_1pj3ayb1n70.html',
                                        custom_regex='http((?!http).){12,}?(download4|pcDownloadFile)')
    print(ret)
    # _count += 1
    # ret = await browser.fetCodeByWebView('https://www.freeok.pro/xplay/63170-8-12.html')
    # print(ret)
    await browser.close()
    t2 = time()
    print(f'Sniffed {_count} pages in {round(t2 - t1, 2)} s total')


if __name__ == '__main__':
    asyncio.run(main_test())