# Crawler_controller.py
  1. from selenium import webdriver
  2. class url:
  3. num = 0#url处理器个数
  4. def __init__(self,dic=f'',dic_run=f'',add_func=lambda url:True,change_url=lambda url:url):
  5. url.num += 1
  6. if dic == '':dic = f'url[{url.num}].cot_url'
  7. if dic_run == '':dic = f'url_run[{url.num}].cot_url'
  8. self.dir = dic
  9. self.dir_run = dic_run
  10. self.file = open(dic,'a')#写入url_history的文件
  11. self.file_run = open(dic_run,'a')#写入已读url文件
  12. self.url_list = []#待读url
  13. self.add_func = add_func#url添加过滤方法
  14. self.change_url = change_url#url更正方法
  15. self.url_history = []#url历史
  16. def add_url(self,url):
  17. url = self.change_url(url)#url更正,比如http替换https
  18. if url not in self.url_history and self.add_func(url):#1.url不存在历史,2.url满足筛选条件
  19. self.url_list.append(url)
  20. self.url_history.append(url)
  21. self.__out_url(url)
  22. return True#写入成功
  23. return False#写入失败
  24. def get_url(self):
  25. url = self.url_list[0]
  26. self.__out_url_run(url)
  27. del self.url_list[0]
  28. return url
  29. def __out_url(self,url):#输出url
  30. self.file.write(f'{url}\n')
  31. self.file.flush()
  32. def __out_url_run(self,url):#输出url
  33. self.file_run.write(f'{url}\n')
  34. self.file_run.flush()
  35. def return_url(self):
  36. return self.url_list.copy()
  37. def return_url_history(self):
  38. return self.url_history.copy()