Crawler_controller.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354
  1. from selenium import webdriver
  2. import threading
  3. import time
  4. from os.path import exists
  5. from os import mkdir
  6. import hashlib
  7. class URL_PAGE():
  8. def __init__(self,url,func='get'):
  9. self.url = url
  10. self.func = func
  11. def __str__(self):
  12. return self.url
  13. class url:#url管理器
  14. num = 0#url处理器个数
  15. def __init__(self,dic=f'',dic_run=f''):
  16. url.num += 1
  17. dic += f'/url[{url.num}].cot_url'
  18. dic_run += f'/url_run[{url.num}].cot_url'
  19. self.dir = dic
  20. self.dir_run = dic_run
  21. self.file = open(dic,'a')#写入url_history的文件
  22. self.file_run = open(dic_run,'a')#写入已读url文件
  23. self.url_list = []#待读url
  24. self.url_history = []#url历史
  25. self.filter = {}#过滤函数
  26. def filter_func(self,url):#url过滤系统
  27. for i in self.filter:
  28. if not self.filter[i](url): return False
  29. return True
  30. def Add_func(self,func,name):#添加过滤函数
  31. self.filter[name] = func
  32. def Del_func(self,index):#删除过滤函数
  33. del self.filter[list(self.filter.keys())[index]]
  34. def return_func(self):
  35. return list(self.filter.keys())
  36. def add_url(self,url):#添加url
  37. if url not in self.url_history and self.filter_func(url):#1.url不存在历史,2.url满足筛选条件
  38. self.url_list.append(URL_PAGE(url,'get'))#添加到待取得url
  39. self.url_history.append(url)#添加到历史url
  40. self.__out_url(url)#输出历史url
  41. return True#写入成功
  42. return False#写入失败
  43. def del_url(self,index):#删除url
  44. self.__out_url_run(f'DELETE {self.url_list[index]}')
  45. del self.url_list[index]
  46. def get_url(self) -> URL_PAGE:#取得url
  47. url_page = self.url_list[0]
  48. self.__out_url_run(url_page.url)
  49. del self.url_list[0]
  50. return url_page
  51. def __out_url(self,url):#输出url历史
  52. self.file.write(f'{url}\n')
  53. self.file.flush()
  54. def __out_url_run(self,url):#输出已经运行的url
  55. self.file_run.write(f'{url}\n')
  56. self.file_run.flush()
  57. def return_url(self):
  58. return self.url_list.copy()
  59. def return_url_history(self):
  60. return self.url_history.copy()
  61. class Page_Downloader:
  62. num = 0
  63. def __init__(self,url:url,dic=''):
  64. self.url = url
  65. self.dir = dic
  66. Page_Downloader.num += 1
  67. self.page_source_dict = {}#页面保存信息
  68. self.cookie_Thread = None#子进程
  69. self.browser = None
  70. def __seeting(self,*args):#设置参数,请求头
  71. options = webdriver.ChromeOptions()
  72. options.add_argument('disable-infobars')# 不显示提示语句
  73. for i in args:
  74. if i == '':continue
  75. options.add_argument(i)
  76. return options
  77. def strat_urlGet(self,*args,func_cookie):#用get请求url ->得到一个页面信息
  78. self.break_ = False
  79. self.page_source_dict = {}
  80. self.nowurl = self.url.get_url()#获取一个url
  81. url = self.nowurl.url
  82. self.browser = webdriver.Chrome(chrome_options=self.__seeting(*args))
  83. self.browser.get(url)
  84. self.break_ = True
  85. def update_cookie():
  86. nonlocal self
  87. while self.break_:
  88. try:
  89. func_cookie(self.browser.get_cookies()) # 与GUI通信显示cookie
  90. time.sleep(.5)
  91. except:pass
  92. self.cookie_Thread = threading.Thread(target=update_cookie)
  93. self.cookie_Thread.start()
  94. self.Parser.browser = self.browser
  95. self.Parser.init()
  96. return self.browser
  97. def Del_cookies(self,name):#删除指定cookies
  98. browser = self.browser
  99. browser.delete_cookie(name)
  100. def Tra_cookies(self):#清空cookies
  101. browser = self.browser
  102. browser.delete_all_cookies()
  103. def Add_cookies(self,cookies:dict):#清空cookies
  104. browser = self.browser
  105. browser.add_cookie(cookies)
  106. def update_cookies(self,name,cookies:dict,):
  107. browser = self.browser
  108. cookies_list = browser.get_cookies()
  109. for i in cookies_list:
  110. if i.get('name',None) == name:
  111. browser.delete_cookie(name)#删除原来cookies
  112. i.update(cookies)
  113. browser.add_cookie(i)
  114. return
  115. raise Exception
  116. def set_Page_Parser(self,Parser):
  117. self.Parser = Parser
  118. self.Parser.browser = self.browser
  119. self.Parser.url = self.url
  120. class Page_Parser:
  121. def __init__(self,Downloader:Page_Downloader):
  122. self.Downloader = Downloader
  123. self.Downloader.set_Page_Parser(self)
  124. self.func_list = []
  125. self.func_dict = {}
  126. self.init()
  127. def init(self):
  128. self.element_dict = {}#记录属性的名字
  129. def add_base(self,func): # 装饰器
  130. def wrap(browser=None,num=None,name=None, *args, **kwargs):
  131. try:
  132. func(browser,num, name, *args, **kwargs)
  133. return False
  134. except:
  135. return True
  136. return wrap
  137. def add_func(self,name,func):
  138. n = len(self.func_list)
  139. self.func_list.append(f'{name}[{n}]')
  140. self.func_dict[f'{name}[{n}]'] = func
  141. def return_func(self):
  142. return self.func_list.copy()
  143. def find_ID(self,id,not_all=False):
  144. @self.add_base
  145. def find(browser, num, name, *args, **kwargs):
  146. nonlocal self,id
  147. if browser == None:browser = self.browser
  148. if not_all:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_id(id)]#返回必须是list
  149. else:self.element_dict[f'{name}[{num}]'] = browser.find_elements_by_id(id)
  150. self.add_func(f'find_ID:{id}',find)#添加func
  151. def find_class(self,class_name,not_all=False):
  152. @self.add_base
  153. def find(browser, num, name, *args, **kwargs):
  154. nonlocal self,class_name
  155. if browser == None:browser = self.browser
  156. if not_all:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_class_name(class_name)]#返回必须是list
  157. else:self.element_dict[f'{name}[{num}]'] = browser.find_elements_by_class_name(class_name)#返回必须是list
  158. self.add_func(f'find_class:{class_name}',find)#添加func
  159. def find_name(self,name_,not_all=False):
  160. @self.add_base
  161. def find(browser, num, name, *args, **kwargs):
  162. nonlocal self,name_
  163. if browser == None:browser = self.browser
  164. if not_all:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_name(name_)]#返回必须是list
  165. else:self.element_dict[f'{name}[{num}]'] = browser.find_elements_by_name(name_)#返回必须是list
  166. self.add_func(f'find_name:{name_}',find)#添加func
  167. def find_xpath(self,xpath,not_all=False):
  168. @self.add_base
  169. def find(browser, num, name, *args, **kwargs):
  170. nonlocal self,xpath
  171. if browser == None:browser = self.browser
  172. if not_all:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_xpath(xpath)]#返回必须是list
  173. else:self.element_dict[f'{name}[{num}]'] = browser.find_elements_by_xpath(xpath)#返回必须是list
  174. self.add_func(f'find_xpath:{xpath}',find)#添加func
  175. def find_css(self,css_selector,not_all=False):
  176. @self.add_base
  177. def find(browser, num, name, *args, **kwargs):
  178. nonlocal self,css_selector
  179. if browser == None:browser = self.browser
  180. if not_all:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_css_selector(css_selector)]#返回必须是list
  181. else:self.element_dict[f'{name}[{num}]'] = browser.find_elements_by_css_selector(css_selector)#返回必须是list
  182. self.add_func(f'find_css:{css_selector}',find)#添加func
  183. def find_tag_name(self,tag_name,not_all=False):
  184. @self.add_base
  185. def find(browser, num, name, *args, **kwargs):
  186. nonlocal self,tag_name
  187. if browser == None:browser = self.browser
  188. if not_all:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_tag_name(tag_name)]#返回必须是list
  189. else:self.element_dict[f'{name}[{num}]'] = browser.find_elements_by_tag_name(tag_name)#返回必须是list
  190. self.add_func(f'find_tagName:{tag_name}',find)#添加func\
  191. def find_link_text(self,link_text,not_all=False):#匹配link
  192. @self.add_base
  193. def find(browser, num, name, *args, **kwargs):
  194. nonlocal self,link_text
  195. if browser == None:browser = self.browser
  196. if not_all:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_link_text(link_text)]#返回必须是list
  197. else:self.element_dict[f'{name}[{num}]'] = browser.find_elements_by_link_text(link_text)#返回必须是list
  198. self.add_func(f'find_link_text:{link_text}',find)#添加func
  199. def find_partial_link_text(self,partial_link_text,not_all=False):#模糊匹配
  200. @self.add_base
  201. def find(browser, num, name, *args, **kwargs):
  202. nonlocal self,partial_link_text
  203. if browser == None:browser = self.browser
  204. if not_all:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_partial_link_text(partial_link_text)]#返回必须是list
  205. else:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_partial_link_text(partial_link_text)]#返回必须是list
  206. self.add_func(f'find_partial_link_text:{partial_link_text}',find)#添加func
  207. def find_switch_to_alert(self,*args,**kwargs):#定位弹出框
  208. @self.add_base
  209. def find(browser, num, name, *args, **kwargs):
  210. nonlocal self
  211. if browser == None:browser = self.browser
  212. self.element_dict[f'{name}[{num}]'] = [browser.switch_to.alert()]
  213. self.add_func(f'find_alert',find)#添加func
  214. def find_switch_to_active_element(self,*args,**kwargs):#定位焦点元素
  215. @self.add_base
  216. def find(browser, num, name, *args, **kwargs):
  217. nonlocal self
  218. if browser == None:browser = self.browser
  219. self.element_dict[f'{name}[{num}]'] = [browser.switch_to.active_element()]
  220. self.add_func(f'active_element',find)#添加func
  221. def find_switch_to_frame(self,reference,is_id=False,*args,**kwargs):#定位弹出框
  222. @self.add_base
  223. def find(browser, num, name, *args, **kwargs):
  224. nonlocal self,reference,is_id
  225. if browser == None:browser = self.browser
  226. if reference == None:
  227. self.element_dict[f'{name}[{num}]'] = [browser.default_content()]# 回到主文档
  228. elif reference == '':
  229. self.element_dict[f'{name}[{num}]'] = [browser.parent_frame()]# 回到父文档
  230. else:
  231. if is_id:reference = int(reference)
  232. self.element_dict[f'{name}[{num}]'] = [browser.switch_to.frame(str(reference))]# 定位进入文档
  233. self.add_func(f'find_frame:{reference}',find)#添加func
  234. def send_keys(self,text,element_value,index=0):#输入文字
  235. @self.add_base
  236. def action(*args, **kwargs):
  237. nonlocal self
  238. self.element_dict[element_value][index].send_keys(text)
  239. self.add_func(f'sent_text:{text}>{element_value}[{index}]', action) # 添加func
  240. def User_Passwd(self,User,Passwd,element_value,index=0):#输入验证(User&Password)
  241. @self.add_base
  242. def action(*args, **kwargs):
  243. nonlocal self
  244. self.element_dict[element_value][index].authenticate(User,Passwd)
  245. self.add_func(f'sent_text:{User};{Passwd}>{element_value}[{index}]', action) # 添加func
  246. def clear(self,element_value,index=0):#清空文本
  247. @self.add_base
  248. def action(*args, **kwargs):
  249. nonlocal self
  250. self.element_dict[element_value][index].clear()
  251. self.add_func(f'clear_text>{element_value}[{index}]', action) # 添加func
  252. def click(self,element_value,index=0):#点击按钮
  253. @self.add_base
  254. def action(*args, **kwargs):
  255. nonlocal self
  256. self.element_dict[element_value][index].click()
  257. self.add_func(f'click>{element_value}[{index}]', action) # 添加func
  258. def accept(self,element_value,index=0):#点击确定(弹出框)
  259. @self.add_base
  260. def action(*args, **kwargs):
  261. nonlocal self
  262. self.element_dict[element_value][index].accept()
  263. self.add_func(f'accept>{element_value}[{index}]', action) # 添加func
  264. def dismiss(self,element_value,index=0):#点击取消(弹出框)
  265. @self.add_base
  266. def action(*args, **kwargs):
  267. nonlocal self
  268. self.element_dict[element_value][index].dismiss()
  269. self.add_func(f'dismiss>{element_value}[{index}]', action) # 添加func
  270. def submit(self,element_value,index=0):#点击按钮
  271. @self.add_base
  272. def action(*args, **kwargs):
  273. nonlocal self
  274. self.element_dict[element_value][index].submit()
  275. self.add_func(f'submit>{element_value}[{index}]', action) # 添加func
  276. def deselect_by_index(self,element_value,deselect_index,index=0):#根据index取消选择
  277. @self.add_base
  278. def action(*args, **kwargs):
  279. nonlocal self
  280. self.element_dict[element_value][index].deselect_by_index(int(deselect_index))
  281. self.add_func(f'deselect_by_index:{deselect_index}>{element_value}[{index}]', action) # 添加func
  282. def deselect_by_text(self,element_value,deselect_text,index=0):#根据text取消选择
  283. @self.add_base
  284. def action(*args, **kwargs):
  285. nonlocal self
  286. self.element_dict[element_value][index].deselect_by_visible_text(deselect_text)
  287. self.add_func(f'deselect_by_text:{deselect_text}>{element_value}[{index}]', action) # 添加func
  288. def select_by_index(self,element_value,deselect_index,index=0):#根据index选择
  289. @self.add_base
  290. def action(*args, **kwargs):
  291. nonlocal self
  292. self.element_dict[element_value][index].select_by_index(int(deselect_index))
  293. self.add_func(f'select_by_index:{deselect_index}>{element_value}[{index}]', action) # 添加func
  294. def select_by_text(self,element_value,deselect_text,index=0):#根据text选择
  295. @self.add_base
  296. def action(*args, **kwargs):
  297. nonlocal self
  298. self.element_dict[element_value][index].select_by_visible_text(deselect_text)
  299. self.add_func(f'select_by_text:{deselect_text}>{element_value}[{index}]', action) # 添加func
  300. def Element_interaction(self):#元素交互
  301. func_list = self.func_list
  302. for func_num in range(len(func_list)):
  303. self.func_dict[func_list[func_num]](num=f'{func_num}',name='var')