测试pdf.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. # File : 测试pdf.py
  4. # Author: DaShenHan&道长-----先苦后甜,任凭晚风拂柳颜------
  5. # Date : 2022/11/14
  6. from utils.ua import MOBILE_UA
  7. from utils.htmlParser import jsoup
  8. import requests
  9. from pyquery import PyQuery as pq
  def main(timeout=10):
      """Fetch the m.ysxs8.vip home page and print several jsoup rule results.

      The page is requested with the mobile User-Agent, decoded as gb18030,
      and the elements returned for ``.list-ul:eq(-1)`` are collected with
      ``jsp.pdfa``. A series of ``pdfh`` / ``pd`` rules is then run against
      the first collected element and each result is printed.

      Args:
          timeout: Value passed to ``requests.get(timeout=...)`` so that an
              unresponsive host cannot block the script forever.
      """
      r = requests.get(
          'http://m.ysxs8.vip',
          headers={'User-Agent': MOBILE_UA},
          timeout=timeout,
      )
      # Force the decoding used for r.text instead of relying on the headers.
      r.encoding = 'gb18030'
      html = r.text

      jsp = jsoup(r.url)
      lis = jsp.pdfa(html, '.list-ul:eq(-1)')
      print(len(lis), lis)
      if not lis:
          # Nothing matched; indexing lis[0] below would raise IndexError.
          print('no element matched .list-ul:eq(-1)')
          return

      first = lis[0]
      print(first)

      # Rules evaluated with pdfh, in the order they are printed.
      pdfh_rules = (
          'a&&li&&img&&alt',
          'a&&li&&img&&data-original',
          'a:eq(1)&&li&&Html',
          'a:eq(1) li img',
      )
      for rule in pdfh_rules:
          print(jsp.pdfh(first, rule))

      # pd is used for URL-like attributes; presumably it resolves them
      # against the base URL given to jsoup - confirm in utils.htmlParser.
      print('src:', jsp.pd(first, 'a:eq(1)&&li&&img&&src'))
      print('href:', jsp.pd(first, 'a&&href'))
  def main1(timeout=10):
      """Fetch a lanhua.tv detail page and print two ``pdfh`` text rules.

      Both rules target ``li:eq(2)`` under ``.content_min``: the first takes
      the text of the ``a`` inside it, the second the text of the ``li``
      itself.

      Args:
          timeout: Value passed to ``requests.get(timeout=...)`` so that an
              unresponsive host cannot block the script forever.
      """
      url = 'https://www.lanhua.tv/voddetail/7420.html'
      r = requests.get(url, headers={'User-Agent': MOBILE_UA}, timeout=timeout)
      # Unlike main(), no r.encoding override is applied here.
      html = r.text

      jsp = jsoup(r.url)
      for rule in (
          '.content_min&&ul&&li:eq(2) a&&Text',
          '.content_min&&ul&&li:eq(2)&&Text',
      ):
          print(jsp.pdfh(html, rule))
  def main2(timeout=10):
      """Fetch a tvyb03.com detail page and print selector counts and ``h1`` results.

      Prints, in order: the number of elements ``pdfa`` returns for
      ``.myui-panel__head h3`` and for ``.myui-panel__head:eq(1) h3``, then
      the ``pdfh`` result for ``h1&&Text`` and for ``h1``, then the ``pdfa``
      result for ``h1``.

      Args:
          timeout: Value passed to ``requests.get(timeout=...)`` so that an
              unresponsive host cannot block the script forever.
      """
      url = 'http://www.tvyb03.com/vod/detail/id/117659.html'
      r = requests.get(url, headers={'User-Agent': MOBILE_UA}, timeout=timeout)
      html = r.text
      jsp = jsoup(r.url)

      # How many elements each selector yields.
      for selector in ('.myui-panel__head h3', '.myui-panel__head:eq(1) h3'):
          print(len(jsp.pdfa(html, selector)))

      # The same h1 seen through pdfh (with and without &&Text) and pdfa.
      print(jsp.pdfh(html, 'h1&&Text'))
      print(jsp.pdfh(html, 'h1'))
      print(jsp.pdfa(html, 'h1'))
  def main3(timeout=10):
      """Print ``pdfh`` results for rules using the ``--`` suffix, then one live ``pdfa``.

      The first part runs entirely on an inline HTML snippet: a ``<p>`` that
      holds two text runs and two ``<span>`` children (ids ``exd1`` and
      ``exd2``). Presumably ``selector--other`` removes the nodes matching
      ``other`` before the value is extracted - confirm in utils.htmlParser.

      The second part downloads a leyupro.com page and prints the ``pdfa``
      result for ``.yunplay&&.downtitle&&ul li``.

      Args:
          timeout: Value passed to ``requests.get(timeout=...)`` so that an
              unresponsive host cannot block the script forever.
      """
      # Built from single-line pieces; the value is the same multi-line snippet.
      html = (
          "\n"
          "<div>\n"
          "<p>内容1<span id='exd1'>我不获取的内容1</span><span id='exd2'>我不获取的内容2</span>内容2</p>\n"
          "</div>\n"
      )
      jsp = jsoup('https://www.cnblogs.com/lizhibk/p/8623543.html')

      rules = (
          'div p:eq(0)--span&&Text',
          'div p--span&&Text',
          'div p:eq(0)--#exd1&&Text',
          'div p:eq(0)--#exd2&&Text',
          'div p:eq(0)--#exd2--#exd1&&Text',
          # No &&Text option on the last two rules.
          'div p--#exd1',
          'div p:first--#exd1',
      )
      for rule in rules:
          print(jsp.pdfh(html, rule))

      # Live page check; the jsoup instance created above is reused.
      html = requests.get(
          'https://www.leyupro.com/lyd/139451.html', timeout=timeout
      ).text
      print(jsp.pdfa(html, '.yunplay&&.downtitle&&ul li'))
  def main4(path='日常.html'):
      """Run a set of detail-page rules against a saved HTML file and print the results.

      For each of the ``title``, ``desc``, ``content`` and ``img`` entries the
      rule string is split on ``;`` and every sub-rule is printed, followed by
      its extracted value. ``img`` is evaluated with ``jsp.pd``; the other
      three with ``jsp.pdfh``. Finally two ``pdfa`` selectors for the tab and
      playlist elements are printed.

      Args:
          path: UTF-8 HTML file to read. Defaults to ``'日常.html'``, the value
              that was in effect before; ``'唐人街电影.html'`` is the other
              local fixture this function was written against.

      Raises:
          OSError: If ``path`` cannot be opened.
      """
      with open(path, encoding='utf-8') as f:
          html = f.read()

      rules = {
          "title": "h2&&Text;.content_detail.content_min.fl .data_style&&Text",
          "img": ".content_thumb .vodlist_thumb&&data-original",
          "desc": ".content_detail.content_min.fl li:eq(0)&&Text;.content_detail.content_min.fl li:eq(2)&&Text;.content_detail.content_min.fl li:eq(3)&&Text",
          "content": ".content&&Text",
          "tabs": ".play_source_tab:eq(0) a",
          "lists": ".content_playlist:eq(#id) li",
      }
      print(rules)

      jsp = jsoup('https://www.tangrenjie.tv/vod/detail/id/218945.html')

      # (rule key, extractor) in output order; only img goes through pd.
      plan = (
          ('title', jsp.pdfh),
          ('desc', jsp.pdfh),
          ('content', jsp.pdfh),
          ('img', jsp.pd),
      )
      for key, extract in plan:
          for rule in rules[key].split(';'):
              print(rule)
              print(extract(html, rule))

      # Hard-coded selectors, not read from the "tabs"/"lists" entries above.
      print(jsp.pdfa(html, '.play_source_tab:eq(0) a'))
      print(jsp.pdfa(html, '#playlistbox&&.content_playlist:eq(1) li'))
  if __name__ == "__main__":
      # Manual switchboard: only main3 runs. Replace the call below with
      # main(), main1(), main2() or main4() to run one of the other checks.
      main3()