|
@@ -12,6 +12,7 @@ import base64
|
|
|
from urllib.parse import urljoin
|
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
|
from pprint import pprint
|
|
|
+import time
|
|
|
|
|
|
import requests
|
|
|
|
|
@@ -41,6 +42,14 @@ def compress_and_encode(data: str):
|
|
|
return encoded_data
|
|
|
|
|
|
|
|
|
def decode_and_decompress(encoded_data: str):
    """Turn a Base64-encoded, gzip-compressed payload back into text.

    Args:
        encoded_data: Base64 text whose decoded bytes form a gzip stream
            containing UTF-8 encoded text.

    Returns:
        The decompressed content as a ``str``.
    """
    # Base64 text -> raw gzip bytes.
    gzip_bytes = base64.b64decode(encoded_data.encode('utf-8'))
    # gzip bytes -> original bytes, then decode them as UTF-8 text.
    plain_bytes = gzip.decompress(gzip_bytes)
    return plain_bytes.decode('utf-8')
|
|
|
+
|
|
|
+
|
|
|
def get_classes(rec):
|
|
|
classes = None
|
|
|
if rec.get('url') and str(rec['url']).startswith('http'):
|
|
@@ -96,6 +105,31 @@ def get_convert_classes(rec):
|
|
|
return classes
|
|
|
|
|
|
|
|
|
def check_class(api, type_name, type_id, limit_count=6):
    """Tell whether one category of a collection API has enough items to keep.

    Requests the first detail page of the category and counts the entries
    of the ``list`` field in the JSON response.

    Args:
        api: Base URL of the collection API.
        type_name: Category name; only used in the log messages.
        type_id: Category id, sent as the ``t`` query parameter.
        limit_count: Minimum number of entries required to keep the category.

    Returns:
        False when ``list`` is missing, empty, or shorter than
        ``limit_count`` (the category should be excluded). True otherwise,
        including when the request or the JSON parsing fails, so a
        transient error never excludes a category.
    """
    _url = f'{api}?ac=detail&pg=1&t={type_id}'
    try:
        # NOTE: verify=False disables TLS certificate verification.
        # `headers` and `timeout` are names defined outside this function.
        r = requests.get(_url, headers=headers, timeout=timeout, verify=False)
        ret = r.json()
        # Normalise a missing/None "list" to [] so that counting it for the
        # log line cannot raise; previously that exception was swallowed by
        # the handler below and the category was wrongly kept.
        items = ret.get("list") or []
        if not items or len(items) < limit_count:
            print(f'获取资源 {api} 分类【{type_name}】数量为:{len(items)} 小于{limit_count}视为排除')
            return False
    except Exception as e:
        # Best effort: log the failure and fall through to "keep".
        print(f'获取资源 {_url} 分类【{type_name}】发生错误:{e}')
    return True
|
|
|
+
|
|
|
+
|
|
|
def check_active(api):
    """Tell whether a collection API responds with category data.

    Args:
        api: URL of the API endpoint to probe.

    Returns:
        True when the JSON response carries a truthy ``class`` field;
        False when it does not, or when the request or JSON parsing raises.
    """
    try:
        # `headers` and `timeout` are names defined outside this function.
        response = requests.get(api, headers=headers, timeout=timeout, verify=False)
        payload = response.json()
        alive = bool(payload.get("class"))
    except Exception as e:
        print(f'检查api: {api} 存活发生错误:{e}')
        return False
    return alive
|
|
|
+
|
|
|
+
|
|
|
def main(fname='采集'):
|
|
|
file_path = f'./{fname}.json'
|
|
|
out_file_path = file_path.replace('.json', '静态.json')
|
|
@@ -111,7 +145,7 @@ def main(fname='采集'):
|
|
|
tasks = [pool.submit(get_convert_classes, rec) for rec in records] # 构造一个列表,循环向线程池内submit提交执行的方法
|
|
|
pool.shutdown(wait=True) # 线程数等待所有线程结束,这里 卡住主线程
|
|
|
results = [task.result() for task in tasks]
|
|
|
- pprint(results)
|
|
|
+ print(results)
|
|
|
new_records = []
|
|
|
for record in records:
|
|
|
rec_name = record["name"]
|
|
@@ -126,8 +160,67 @@ def main(fname='采集'):
|
|
|
f.write(json.dumps(new_records, ensure_ascii=False, indent=2))
|
|
|
|
|
|
|
|
|
def main_exclude(fname='采集静态', max_workers=0):
    """Probe every category of each record and store the rejected ones.

    Reads ``./{fname}.json``. For each record whose API passes
    ``check_active``, every category is checked with ``check_class`` in a
    thread pool. Names of rejected categories are stored under the record's
    ``cate_excludes`` key. The resulting records are written back to the
    same file.

    Args:
        fname: File name without the ``.json`` suffix, resolved against the
            current working directory.
        max_workers: Thread pool size used per record; ``0`` means one
            thread per category.

    Raises:
        SystemExit: When the file does not exist, or when the data is empty
            or its first record has no ``class_name``.
    """
    file_path = f'./{fname}.json'
    # SystemExit is raised directly: it is what exit() does, but exit() is a
    # helper injected by the `site` module and is not meant for programs.
    if not os.path.exists(file_path):
        raise SystemExit(f'不存在采集文件路径:{file_path}')
    with open(file_path, encoding='utf-8') as f:
        records = json.loads(f.read())
    if len(records) < 1 or not records[0].get('class_name'):
        raise SystemExit('输入数据有误,疑似不是静态数据')
    print(records)

    new_records = []
    for rec in records:
        new_rec = rec.copy()
        # Fall back to the default provider path when the record has no 'api'.
        api_url = urljoin(rec['url'], rec.get('api') or '/api.php/provide/vod/')
        print(api_url)
        if not check_active(api_url):
            print(f'{rec["name"]} ({rec["url"]})视为不存活,跳过分类检测')
        else:
            class_names = decode_and_decompress(rec['class_name']).split('&')
            class_urls = rec['class_url'].split('&')
            # zip() stops at the shorter list, so a record whose names and
            # ids differ in count no longer crashes with IndexError;
            # categories without an id are simply left unchecked.
            pairs = list(zip(class_names, class_urls))
            # Leaving the `with` block waits for all submitted checks and
            # always releases the worker threads.
            with ThreadPoolExecutor(max_workers=max_workers or len(pairs)) as rec_pool:
                tasks = [
                    rec_pool.submit(check_class, api_url, type_name, type_id)
                    for type_name, type_id in pairs
                ]
            results = [task.result() for task in tasks]
            print(results)
            cate_excludes = [
                type_name
                for (type_name, _), keep in zip(pairs, results)
                if not keep
            ]
            if cate_excludes:
                new_rec['cate_excludes'] = cate_excludes
        new_records.append(new_rec)

    # The input file is overwritten in place with the updated records.
    with open(file_path, mode='w+', encoding='utf-8') as f:
        f.write(json.dumps(new_records, ensure_ascii=False, indent=2))
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    use_gzip = True
    # Ask for the processing mode first; the default file name offered in
    # the second prompt depends on it.
    fmode = str(input('请输入处理文件方式(0:生成分类 1:添加分类过滤),留空默认为生成静态分类:\n'))
    ftips = '采集' if fmode != '1' else '采集静态'
    fname = str(input(f'请输入文件名(q结束程序),留空默认为{ftips}:\n'))
    t1 = time.time()
    if fname == 'q':
        exit('已主动结束脚本')
    if fmode not in ('', '0', '1'):
        exit(f'未知的处理类型:{fmode}')
    # For the accepted modes, ftips is exactly the default name of that mode.
    fname = fname or ftips
    if fmode == '1':
        main_exclude(fname, 10)
    else:
        main(fname)
    # Report the elapsed wall-clock time of the selected task.
    print(f'本次程序运行耗时:{round(time.time() - t1, 2)}秒')
|