#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# File  : 首图2筛选.py
# Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
# Date  : 2022/9/30
import re

import requests

from utils.htmlParser import jsoup
  9. headers = {'user-agent':'Mozilla/5.0 (Linux; Android 11; M2007J3SC Build/RKQ1.200826.002; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/77.0.3865.120 MQQBrowser/6.2 TBS/045714 Mobile Safari/537.36'}
  10. kv_list = {
  11. '按剧情':'vtype',
  12. '按地区':'varea',
  13. '按年份':'vyear',
  14. '按语言':'vlang',
  15. '按字母':'vword',
  16. }
  17. def getFilters(url):
  18. cate_id = str(re.search('.*/(\d+)', url).groups()[0])
  19. print(cate_id)
  20. jsp = jsoup(url)
  21. pdfh = jsp.pdfh
  22. pdfa = jsp.pdfa
  23. print(jsp)
  24. r = requests.get(url,headers=headers)
  25. r.encoding = r.apparent_encoding
  26. html = r.text
  27. cls_list = pdfa(html,'ul.stui-screen__list')
  28. print(len(cls_list))
  29. ft_dict = {cate_id:[]}
  30. for cls in cls_list:
  31. tt = pdfh(cls,'li&&Text')
  32. if tt.find('按类型')>-1:
  33. continue
  34. values = pdfa(cls,'ul&&a')
  35. # vl = [{"n":pdfh(i,'a&&Text'),"v":pdfh(i,'a&&href')} for i in values]
  36. vl = [{"n":pdfh(i,'a&&Text'),"v":re.search('(.*?)-(.*)',pdfh(i,'a&&href'),re.M|re.I|re.S).groups()[1].replace('.html','').replace('-','')} for i in values]
  37. ft_dict[cate_id].append({
  38. # 'key': kv_list[tt],
  39. 'key': tt.replace('按',''),
  40. 'name': tt,
  41. 'value': vl
  42. })
  43. print(ft_dict)
  44. return ft_dict
  45. if __name__ == '__main__':
  46. new_dict = {}
  47. for i in '1&2&3&4'.split('&'):
  48. ft_dict = getFilters(f'https://www.zbkk.net/vodshow/{i}--------2---.html')
  49. new_dict.update(ft_dict)
  50. print(new_dict)