# Crawler_controller.py — URL manager and Selenium-based page downloader.
import hashlib
import threading
import time
from os import mkdir
from os.path import exists

from selenium import webdriver
  7. class url:#url管理器
  8. num = 0#url处理器个数
  9. def __init__(self,dic=f'',dic_run=f''):
  10. url.num += 1
  11. dic += f'/url[{url.num}].cot_url'
  12. dic_run += f'/url_run[{url.num}].cot_url'
  13. self.dir = dic
  14. self.dir_run = dic_run
  15. self.file = open(dic,'a')#写入url_history的文件
  16. self.file_run = open(dic_run,'a')#写入已读url文件
  17. self.url_list = []#待读url
  18. self.url_history = []#url历史
  19. self.filter = {}#过滤函数
  20. def filter_func(self,url):#url过滤系统
  21. for i in self.filter:
  22. if not self.filter[i](url): return False
  23. return True
  24. def Add_func(self,func,name):#添加过滤函数
  25. self.filter[name] = func
  26. def Del_func(self,index):#删除过滤函数
  27. del self.filter[list(self.filter.keys())[index]]
  28. def return_func(self):
  29. return list(self.filter.keys())
  30. def add_url(self,url):#添加url
  31. if url not in self.url_history and self.filter_func(url):#1.url不存在历史,2.url满足筛选条件
  32. self.url_list.append(url)#添加到待取得url
  33. self.url_history.append(url)#添加到历史url
  34. self.__out_url(url)#输出历史url
  35. return True#写入成功
  36. return False#写入失败
  37. def del_url(self,index):#删除url
  38. self.__out_url_run(f'DELETE {self.url_list[index]}')
  39. del self.url_list[index]
  40. def get_url(self):#取得url
  41. url = self.url_list[0]
  42. self.__out_url_run(url)
  43. del self.url_list[0]
  44. return url
  45. def __out_url(self,url):#输出url历史
  46. self.file.write(f'{url}\n')
  47. self.file.flush()
  48. def __out_url_run(self,url):#输出已经运行的url
  49. self.file_run.write(f'{url}\n')
  50. self.file_run.flush()
  51. def return_url(self):
  52. return self.url_list.copy()
  53. def return_url_history(self):
  54. return self.url_history.copy()
  55. class Page_Downloader:
  56. num = 0
  57. def __init__(self,url:url,dic=''):
  58. self.url = url
  59. self.dir = dic
  60. Page_Downloader.num += 1
  61. self.page_source_dict = {}#页面保存信息
  62. self.wait = {}#等待函数
  63. self.wait_list = []#等待函数的函数名字(执行顺序)
  64. self.cookie_Thread = None#子进程
  65. def Add_func(self,func,name):#添加等待函数
  66. name = f'[{len(self.wait)}]{name}'
  67. def f(*args,**kwargs):
  68. get = func(*args,**kwargs)
  69. print(get)
  70. try:
  71. if get[1] == '':raise Exception
  72. return get#save和name
  73. except:
  74. return False,''
  75. self.wait_list.append(name)
  76. self.wait[name] = f
  77. def Del_func(self,index):#删除等待函数
  78. del self.wait[list(self.wait.keys())[index]]
  79. def return_func(self):
  80. return list(self.wait.keys())
  81. def __seeting(self,*args):#设置参数,请求头
  82. options = webdriver.ChromeOptions()
  83. options.add_argument('disable-infobars')# 不显示提示语句
  84. for i in args:
  85. if i == '':continue
  86. options.add_argument(i)
  87. return options
  88. def strat_urlGet(self,*args):#用get请求url ->得到一个页面信息
  89. self.break_ = False
  90. self.page_source_dict = {}
  91. self.nowurl = self.url.get_url()#获取一个url
  92. url = self.nowurl
  93. self.browser = webdriver.Chrome(chrome_options=self.__seeting(*args))
  94. self.browser.get(url)
  95. return self.browser
  96. def Logical_operation(self,func_cookie=lambda x:None,func_page=lambda x:None):#执行等待策略
  97. browser = self.browser
  98. self.page_source_dict['FIRST_PAGE'] = browser.page_source#记录最先的PAGE源代码
  99. func_page(list(self.page_source_dict.keys()))
  100. self.break_ = True
  101. def update_cookie():
  102. nonlocal self
  103. while self.break_:
  104. try:
  105. func_cookie(self.browser.get_cookies()) # 与GUI通信显示cookie
  106. time.sleep(1)
  107. except:pass
  108. self.cookie_Thread = threading.Thread(target=update_cookie)
  109. self.cookie_Thread.start()
  110. for i in self.wait_list:
  111. save,name = self.wait[i](browser)
  112. if save:
  113. print(save)
  114. self.page_source_dict[name] = browser.page_source
  115. else:
  116. print(save)
  117. func_page(list(self.page_source_dict.keys()))
  118. self.page_source_dict['LAST_PAGE'] = browser.page_source#记录最后的PAGE源代码
  119. func_page(list(self.page_source_dict.keys()))
  120. def save_Page(self):#保存网页
  121. dic = self.dir + f'/Page_{hashlib.md5(self.nowurl.encode("utf8")).hexdigest()}'#通过计算哈希保存页面
  122. a = 0
  123. new_dir = ''
  124. while exists(dic):
  125. new_dir = dic + f'[{a}]'
  126. a += 1
  127. if new_dir == '':new_dir = dic
  128. mkdir(new_dir)
  129. with open(f'{new_dir}/url', 'w') as f:
  130. f.write(self.nowurl)
  131. for i in self.page_source_dict:
  132. with open(f'{new_dir}/{i}.html','w') as f:
  133. f.write(str(self.page_source_dict[i]))
  134. return None
  135. def Del_cookies(self,name):#删除指定cookies
  136. browser = self.browser
  137. browser.delete_cookie(name)
  138. def Tra_cookies(self):#清空cookies
  139. browser = self.browser
  140. browser.delete_all_cookies()
  141. def Add_cookies(self,cookies:dict):#清空cookies
  142. browser = self.browser
  143. browser.add_cookie(cookies)
  144. def update_cookies(self,name,cookies:dict,):
  145. browser = self.browser
  146. cookies_list = browser.get_cookies()
  147. for i in cookies_list:
  148. if i.get('name',None) == name:
  149. browser.delete_cookie(name)#删除原来cookies
  150. i.update(cookies)
  151. browser.add_cookie(i)
  152. return
  153. raise Exception