# base_spider.py (~15 KB)
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. # File : base_spider.py
  4. # Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
  5. # Author's Blog: https://blog.csdn.net/qq_32394351
  6. # Date : 2024/1/7
  7. import os.path
  8. import sys
  9. sys.path.append('..')
  10. try:
  11. # from base.spider import Spider as BaseSpider
  12. from base.spider import BaseSpider
  13. except ImportError:
  14. from t4.base.spider import BaseSpider
  15. import json
  16. import time
  17. import base64
  18. import re
  19. from pathlib import Path
  20. import io
  21. import tokenize
  22. from urllib.parse import quote
  23. """
  24. 配置示例:
  25. t4的配置里ext节点会自动变成api对应query参数extend,但t4的ext字符串不支持路径格式,比如./开头或者.json结尾
  26. api里会自动含有ext参数是base64编码后的选中的筛选条件
  27. {
  28. "key":"hipy_t4_base_spider",
  29. "name":"base_spider(hipy_t4)",
  30. "type":4,
  31. "api":"http://192.168.31.49:5707/api/v1/vod/base_spider",
  32. "searchable":1,
  33. "quickSearch":0,
  34. "filterable":1,
  35. "ext":"base_spider"
  36. },
  37. {
  38. "key": "hipy_t3_base_spider",
  39. "name": "base_spider(hipy_t3)",
  40. "type": 3,
  41. "api": "{{host}}/txt/hipy/base_spider.py",
  42. "searchable": 1,
  43. "quickSearch": 0,
  44. "filterable": 1,
  45. "ext": "{{host}}/txt/hipy/base_spider.json"
  46. },
  47. """
  48. class Spider(BaseSpider): # 元类 默认的元类 type
  49. def getName(self):
  50. return "规则名称如:基础示例"
  51. def init_api_ext_file(self):
  52. """
  53. 这个函数用于初始化py文件对应的json文件,用于存筛选规则。
  54. 执行此函数会自动生成筛选文件
  55. @return:
  56. """
  57. ext_file = __file__.replace('.py', '.json')
  58. print(f'ext_file:{ext_file}')
  59. ext_file_dict = {
  60. "分类1": [{"key": "letter", "name": "首字母", "value": [{"n": "A", "v": "A"}, {"n": "B", "v": "B"}]}],
  61. "分类2": [{"key": "letter", "name": "首字母", "value": [{"n": "A", "v": "A"}, {"n": "B", "v": "B"}]},
  62. {"key": "year", "name": "年份",
  63. "value": [{"n": "2024", "v": "2024"}, {"n": "2023", "v": "2023"}]}],
  64. }
  65. with open(ext_file, mode='w+', encoding='utf-8') as f:
  66. f.write(json.dumps(ext_file_dict, ensure_ascii=False))
  67. def init(self, extend=""):
  68. """
  69. 初始化加载extend,一般与py文件名同名的json文件作为扩展筛选
  70. @param extend:
  71. @return:
  72. """
  73. def init_file(ext_file):
  74. """
  75. 根据与py对应的json文件去扩展规则的筛选条件
  76. """
  77. ext_file = Path(ext_file).as_posix()
  78. if os.path.exists(ext_file):
  79. with open(ext_file, mode='r', encoding='utf-8') as f:
  80. try:
  81. ext_dict = json.loads(f.read())
  82. self.config['filter'].update(ext_dict)
  83. except Exception as e:
  84. print(f'更新扩展筛选条件发生错误:{e}')
  85. ext = self.extend
  86. print(f"============ext:{ext},extend:{extend}============")
  87. if isinstance(ext, str) and ext:
  88. if ext.startswith('./'):
  89. ext_file = os.path.join(os.path.dirname(__file__), ext)
  90. init_file(ext_file)
  91. elif ext.startswith('http'):
  92. try:
  93. r = self.fetch(ext)
  94. self.config['filter'].update(r.json())
  95. except Exception as e:
  96. print(f'更新扩展筛选条件发生错误:{e}')
  97. elif not ext.startswith('./') and not ext.startswith('http'):
  98. ext_file = os.path.join(os.path.dirname(__file__), './' + ext + '.json')
  99. init_file(ext_file)
  100. # 装载模块,这里只要一个就够了
  101. if isinstance(extend, list):
  102. for lib in extend:
  103. if '.Spider' in str(type(lib)):
  104. self.module = lib
  105. break
  106. def isVideoFormat(self, url):
  107. pass
  108. def manualVideoCheck(self):
  109. pass
  110. def homeContent(self, filterable=False):
  111. """
  112. 获取首页分类及筛选数据
  113. @param filterable: 能否筛选,跟t3/t4配置里的filterable参数一致
  114. @return:
  115. """
  116. class_name = '电影&电视剧&综艺&动漫' # 静态分类名称拼接
  117. class_url = '1&2&3&4' # 静态分类标识拼接
  118. result = {}
  119. classes = []
  120. if all([class_name, class_url]):
  121. class_names = class_name.split('&')
  122. class_urls = class_url.split('&')
  123. cnt = min(len(class_urls), len(class_names))
  124. for i in range(cnt):
  125. classes.append({
  126. 'type_name': class_names[i],
  127. 'type_id': class_urls[i]
  128. })
  129. result['class'] = classes
  130. if filterable:
  131. result['filters'] = self.config['filter']
  132. return result
  133. def homeVideoContent(self):
  134. """
  135. 首页推荐列表
  136. @return:
  137. """
  138. d = []
  139. d.append({
  140. 'vod_name': '测试',
  141. 'vod_id': 'index.html',
  142. 'vod_pic': 'https://gitee.com/CherishRx/imagewarehouse/raw/master/image/13096725fe56ce9cf643a0e4cd0c159c.gif',
  143. 'vod_remarks': '原始hipy',
  144. })
  145. result = {
  146. 'list': d
  147. }
  148. return result
  149. def categoryContent(self, tid, pg, filterable, extend):
  150. """
  151. 返回一级列表页数据
  152. @param tid: 分类id
  153. @param pg: 当前页数
  154. @param filterable: 能否筛选
  155. @param extend: 当前筛选数据
  156. @return:
  157. """
  158. page_count = 24 # 默认赋值一页列表24条数据
  159. d = []
  160. d.append({
  161. 'vod_name': '测试',
  162. 'vod_id': 'index.html',
  163. 'vod_pic': 'https://gitee.com/CherishRx/imagewarehouse/raw/master/image/13096725fe56ce9cf643a0e4cd0c159c.gif',
  164. 'vod_remarks': '类型:' + tid,
  165. })
  166. result = {
  167. 'list': d,
  168. 'page': pg,
  169. 'pagecount': 9999 if len(d) >= page_count else pg,
  170. 'limit': 90,
  171. 'total': 999999,
  172. }
  173. return result
  174. def detailContent(self, ids):
  175. """
  176. 返回二级详情页数据
  177. @param ids: 一级传过来的vod_id列表
  178. @return:
  179. """
  180. vod_id = ids[0]
  181. vod = {"vod_id": vod_id,
  182. "vod_name": '测试二级',
  183. "vod_pic": 'https://gitee.com/CherishRx/imagewarehouse/raw/master/image/13096725fe56ce9cf643a0e4cd0c159c.gif',
  184. "type_name": '详情页类型',
  185. "vod_year": '详情页年份',
  186. "vod_area": '详情页地区',
  187. "vod_remarks": '详情页标签',
  188. "vod_actor": '详情页演员名称',
  189. "vod_director": '详情页导演名称',
  190. "vod_content": '详情页剧情描述',
  191. "vod_play_from": '测试线路1$$$测试线路2',
  192. "vod_play_url": '选集播放1$1.mp4#选集播放2$2.mp4$$$选集播放3$3.mp4#选集播放4$4.mp4'}
  193. result = {
  194. 'list': [vod]
  195. }
  196. return result
  197. def searchContent(self, wd, quick=False, pg=1):
  198. """
  199. 返回搜索列表
  200. @param wd: 搜索关键词
  201. @param quick: 是否来自快速搜索。t3/t4配置里启用了快速搜索,在快速搜索在执行才会是True
  202. @return:
  203. """
  204. headers = {
  205. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Safari/537.36",
  206. "Host": "www.bttwo.net",
  207. "Referer": "https://www.bttwo.net/"
  208. }
  209. url = f'https://www.bttwo.net/xssearch?q={quote(wd)}'
  210. r = self.fetch(url, headers=headers)
  211. cookies = ['myannoun=1']
  212. for key, value in r.headers.items():
  213. if str(key).lower() == 'set-cookie':
  214. cookies.append(value.split(';')[0])
  215. new_headers = {
  216. 'Cookie': ';'.join(cookies),
  217. # 'Pragma': 'no-cache',
  218. # 'Origin': 'https://www.bttwo.net',
  219. # 'Referer': url,
  220. # 'Sec-Ch-Ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
  221. # 'Sec-Ch-Ua-Mobile': '?0',
  222. # 'Sec-Ch-Ua-Platform': '"Windows"',
  223. # 'Sec-Fetch-Dest': 'document',
  224. # 'Sec-Fetch-Mode': 'navigate',
  225. # 'Sec-Fetch-Site': 'same-origin',
  226. # 'Sec-Fetch-User': '?1',
  227. # 'Upgrade-Insecure-Requests': '1',
  228. }
  229. headers.update(new_headers)
  230. print(headers)
  231. html = self.html(r.text)
  232. captcha = ''.join(html.xpath('//*[@class="erphp-search-captcha"]/form/text()')).strip()
  233. print('验证码:', captcha)
  234. answer = self.eval_computer(captcha)
  235. print('回答:', captcha, answer)
  236. data = {'result': str(answer)}
  237. print('待post数据:', data)
  238. self.post(url, data=data, headers=headers, cookies=None)
  239. r = self.fetch(url, headers=headers)
  240. # print(r.text)
  241. html = self.html(r.text)
  242. lis = html.xpath('//*[contains(@class,"search_list")]/ul/li')
  243. print('搜索结果数:', len(lis))
  244. d = []
  245. if len(lis) < 1:
  246. d.append({
  247. 'vod_name': wd,
  248. 'vod_id': 'index.html',
  249. 'vod_pic': 'https://gitee.com/CherishRx/imagewarehouse/raw/master/image/13096725fe56ce9cf643a0e4cd0c159c.gif',
  250. 'vod_remarks': '测试搜索',
  251. })
  252. else:
  253. for li in lis:
  254. d.append({
  255. 'vod_name': ''.join(li.xpath('h3//text()')),
  256. 'vod_id': ''.join(li.xpath('a/@href')),
  257. 'vod_pic': ''.join(li.xpath('a/img/@data-original')),
  258. 'vod_remarks': ''.join(li.xpath('p//text()')),
  259. })
  260. result = {
  261. 'list': d
  262. }
  263. print(result)
  264. return result
  265. def playerContent(self, flag, id, vipFlags):
  266. """
  267. 解析播放,返回json。壳子视情况播放直链或进行嗅探
  268. @param flag: vod_play_from 播放来源线路
  269. @param id: vod_play_url 播放的链接
  270. @param vipFlags: vip标识
  271. @return:
  272. """
  273. # url = 'http://bizcommon.alicdn.com/l2nDqpMmn6DGHnWzZQA/Cg9qI5imMInpPvK5Mnm%40%40hd.m3u8'
  274. url = 'https://s1.bfzycdn.com/video/renmindemingyi/%E7%AC%AC07%E9%9B%86/index.m3u8'
  275. parse = 0
  276. headers = {
  277. 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'
  278. }
  279. result = {
  280. 'parse': parse, # 1=嗅探,0=播放
  281. 'playUrl': '', # 解析链接
  282. 'url': url, # 直链或待嗅探地址
  283. 'header': headers, # 播放UA
  284. }
  285. return result
  286. @staticmethod
  287. def adRemove():
  288. return 'reg:/video/adjump.*?ts'
  289. config = {
  290. "player": {},
  291. "filter": {}
  292. }
  293. header = {
  294. "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.54 Safari/537.36",
  295. "Host": "www.baidu.com",
  296. "Referer": "https://www.baidu.com/"
  297. }
  298. def localProxy(self, params):
  299. # http://192.168.31.49:5707/api/v1/vod/哔滴影视?proxy=1&do=py&type=1.m3u8
  300. print(params)
  301. content = """
  302. #EXTM3U
  303. #EXT-X-VERSION:3
  304. #EXT-X-ALLOW-CACHE:YES
  305. #EXT-X-MEDIA-SEQUENCE:170471784
  306. #EXT-X-TARGETDURATION:10
  307. #EXT-X-PROGRAM-DATE-TIME:2024-01-11T20:43:53+08:00
  308. #EXTINF:10.000, no desc
  309. http://gctxyc.liveplay.myqcloud.com/gc/gllj01_1_md-170471784.ts
  310. #EXT-X-PROGRAM-DATE-TIME:2024-01-11T20:44:03+08:00
  311. #EXTINF:10.000, no desc
  312. http://gctxyc.liveplay.myqcloud.com/gc/gllj01_1_md-170471785.ts
  313. #EXT-X-PROGRAM-DATE-TIME:2024-01-11T20:44:13+08:00
  314. #EXTINF:10.000, no desc
  315. http://gctxyc.liveplay.myqcloud.com/gc/gllj01_1_md-170471786.ts
  316. #EXT-X-PROGRAM-DATE-TIME:2024-01-11T20:44:23+08:00
  317. #EXTINF:10.000, no desc
  318. http://gctxyc.liveplay.myqcloud.com/gc/gllj01_1_md-170471787.ts
  319. """.strip()
  320. return [200, 'text/plain', content]
  321. # return [404, 'text/plain', 'Not Found']
  322. # return [200, "video/MP2T", content]
  323. # return [200, "video/MP2T", ""]
  324. # -----------------------------------------------自定义函数-----------------------------------------------
  325. def eval_computer(self, text):
  326. """
  327. 自定义的字符串安全计算器
  328. @param text:字符串的加减乘除
  329. @return:计算后得到的值
  330. """
  331. localdict = {}
  332. self.safe_eval(f'ret={text.replace("=", "")}', localdict)
  333. ret = localdict.get('ret') or None
  334. return ret
  335. def safe_eval(self, code: str = '', localdict: dict = None):
  336. code = code.strip()
  337. if not code:
  338. return {}
  339. if localdict is None:
  340. localdict = {}
  341. builtins = __builtins__
  342. if not isinstance(builtins, dict):
  343. builtins = builtins.__dict__.copy()
  344. else:
  345. builtins = builtins.copy()
  346. for key in ['__import__', 'eval', 'exec', 'globals', 'dir', 'copyright', 'open', 'quit']:
  347. del builtins[key] # 删除不安全的关键字
  348. # print(builtins)
  349. global_dict = {'__builtins__': builtins,
  350. 'json': json, 'print': print,
  351. 're': re, 'time': time, 'base64': base64
  352. } # 禁用内置函数,不允许导入包
  353. try:
  354. self.check_unsafe_attributes(code)
  355. exec(code, global_dict, localdict)
  356. return localdict
  357. except Exception as e:
  358. return {'error': f'执行报错:{e}'}
  359. # ==================== 静态函数 ======================
  360. @staticmethod
  361. def check_unsafe_attributes(string):
  362. """
  363. 安全检测需要exec执行的python代码
  364. :param string:
  365. :return:
  366. """
  367. g = tokenize.tokenize(io.BytesIO(string.encode('utf-8')).readline)
  368. pre_op = ''
  369. for toktype, tokval, _, _, _ in g:
  370. if toktype == tokenize.NAME and pre_op == '.' and tokval.startswith('_'):
  371. attr = tokval
  372. msg = "access to attribute '{0}' is unsafe.".format(attr)
  373. raise AttributeError(msg)
  374. elif toktype == tokenize.OP:
  375. pre_op = tokval
  376. if __name__ == '__main__':
  377. spider = Spider()
  378. spider.init()
  379. # spider.init_api_ext_file() # 生成筛选对应的json文件
  380. spider.log({'key': 'value'})
  381. spider.log('====文本内容====')
  382. with open('test_1.txt', encoding='utf-8') as f:
  383. code = f.read()
  384. a = spider.superStr2dict(code)
  385. print(type(a), a)
  386. # spider.searchContent('斗罗大陆')
  387. print(spider.playerContent(None, 1, None))
  388. with open('ad.m3u8', encoding='utf-8') as f:
  389. adt = f.read()
  390. url = adt.split('\n')[0]
  391. adt = '\n'.join(adt.split('\n')[1:])
  392. ad_remove = 'reg:/video/adjump(.*?)ts'
  393. print(spider.fixAdM3u8(adt, url, ad_remove))