# Crawler_controller.py
import functools
import hashlib
import threading
import time
from os import mkdir
from os.path import exists
from time import sleep

from selenium import webdriver
  8. class URL_PAGE():
  9. def __init__(self,url,func='get'):
  10. self.url = url
  11. self.func = func
  12. def __str__(self):
  13. return self.url
  14. class url:#url管理器
  15. num = 0#url处理器个数
  16. def __init__(self,dic=f'',dic_run=f''):
  17. url.num += 1
  18. dic += f'/url[{url.num}].cot_url'
  19. dic_run += f'/url_run[{url.num}].cot_url'
  20. self.dir = dic
  21. self.dir_run = dic_run
  22. self.file = open(dic,'a')#写入url_history的文件
  23. self.file_run = open(dic_run,'a')#写入已读url文件
  24. self.url_list = []#待读url
  25. self.url_history = []#url历史
  26. self.filter = {}#过滤函数
  27. def filter_func(self,url):#url过滤系统
  28. for i in self.filter:
  29. if not self.filter[i](url): return False
  30. return True
  31. def Add_func(self,func,name):#添加过滤函数
  32. self.filter[name] = func
  33. def Del_func(self,index):#删除过滤函数
  34. del self.filter[list(self.filter.keys())[index]]
  35. def return_func(self):
  36. return list(self.filter.keys())
  37. def add_url(self,url):#添加url
  38. if url not in self.url_history and self.filter_func(url):#1.url不存在历史,2.url满足筛选条件
  39. self.url_list.append(URL_PAGE(url,'get'))#添加到待取得url
  40. self.url_history.append(url)#添加到历史url
  41. self.__out_url(url)#输出历史url
  42. return True#写入成功
  43. return False#写入失败
  44. def del_url(self,index):#删除url
  45. self.__out_url_run(f'DELETE {self.url_list[index]}')
  46. del self.url_list[index]
  47. def get_url(self) -> URL_PAGE:#取得url
  48. url_page = self.url_list[0]
  49. self.__out_url_run(url_page.url)
  50. del self.url_list[0]
  51. return url_page
  52. def __out_url(self,url):#输出url历史
  53. self.file.write(f'{url}\n')
  54. self.file.flush()
  55. def __out_url_run(self,url):#输出已经运行的url
  56. self.file_run.write(f'{url}\n')
  57. self.file_run.flush()
  58. def return_url(self):
  59. return self.url_list.copy()
  60. def return_url_history(self):
  61. return self.url_history.copy()
  62. class Page_Downloader:
  63. num = 0
  64. def __init__(self,url:url,dic=''):
  65. self.url = url
  66. self.dir = dic
  67. Page_Downloader.num += 1
  68. self.page_source_dict = {}#页面保存信息
  69. self.cookie_Thread = None#子进程
  70. self.browser = None
  71. def __seeting(self,*args):#设置参数,请求头
  72. options = webdriver.ChromeOptions()
  73. options.add_argument('disable-infobars')# 不显示提示语句
  74. for i in args:
  75. if i == '':continue
  76. options.add_argument(i)
  77. return options
  78. def strat_urlGet(self,*args,func_cookie):#用get请求url ->得到一个页面信息
  79. self.break_ = False
  80. self.page_source_dict = {}
  81. self.nowurl = self.url.get_url()#获取一个url
  82. url = self.nowurl.url
  83. self.browser = webdriver.Chrome(chrome_options=self.__seeting(*args))
  84. self.browser.get(url)
  85. self.break_ = True
  86. def update_cookie():
  87. nonlocal self
  88. while self.break_:
  89. try:
  90. func_cookie(self.browser.get_cookies()) # 与GUI通信显示cookie
  91. time.sleep(.5)
  92. except:pass
  93. self.cookie_Thread = threading.Thread(target=update_cookie)
  94. self.cookie_Thread.start()
  95. self.Parser.browser = self.browser
  96. self.Parser.init()
  97. return self.browser
  98. def Del_cookies(self,name):#删除指定cookies
  99. browser = self.browser
  100. browser.delete_cookie(name)
  101. def Tra_cookies(self):#清空cookies
  102. browser = self.browser
  103. browser.delete_all_cookies()
  104. def Add_cookies(self,cookies:dict):#清空cookies
  105. browser = self.browser
  106. browser.add_cookie(cookies)
  107. def update_cookies(self,name,cookies:dict,):
  108. browser = self.browser
  109. cookies_list = browser.get_cookies()
  110. for i in cookies_list:
  111. if i.get('name',None) == name:
  112. browser.delete_cookie(name)#删除原来cookies
  113. i.update(cookies)
  114. browser.add_cookie(i)
  115. return
  116. raise Exception
  117. def set_Page_Parser(self,Parser):
  118. self.Parser = Parser
  119. self.Parser.browser = self.browser
  120. self.Parser.url = self.url
  121. class Page_Parser:
  122. def __init__(self,Downloader:Page_Downloader):
  123. self.Downloader = Downloader
  124. self.Downloader.set_Page_Parser(self)
  125. self.func_list = []
  126. self.func_dict = {}
  127. self.init()
  128. def init(self):
  129. self.element_dict = {}#记录属性的名字
  130. def add_base(self,func): # 装饰器
  131. def wrap(browser=None,num=None,name=None, *args, **kwargs):
  132. try:
  133. func(browser=browser,num=num, name=name, *args, **kwargs)
  134. return True
  135. except:
  136. return False
  137. return wrap
  138. def add_func(self,name,func):
  139. n = len(self.func_list)
  140. self.func_list.append(f'{name}[{n}]')
  141. self.func_dict[f'{name}[{n}]'] = func
  142. def return_func(self):
  143. return self.func_list.copy()
  144. def find_ID(self,id,not_all=False,**kwargs):
  145. @self.add_base
  146. def find(browser, num, name, *args, **kwargs):
  147. nonlocal self,id
  148. if browser == None:browser = self.browser
  149. if not_all:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_id(id)]#返回必须是list
  150. else:self.element_dict[f'{name}[{num}]'] = browser.find_elements_by_id(id)
  151. self.add_func(f'find_ID:{id}',find)#添加func
  152. def find_class(self,class_name,not_all=False,**kwargs):
  153. @self.add_base
  154. def find(browser, num, name, *args, **kwargs):
  155. nonlocal self,class_name
  156. if browser == None:browser = self.browser
  157. if not_all:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_class_name(class_name)]#返回必须是list
  158. else:self.element_dict[f'{name}[{num}]'] = browser.find_elements_by_class_name(class_name)#返回必须是list
  159. self.add_func(f'find_class:{class_name}',find)#添加func
  160. def find_name(self,name_,not_all=False,**kwargs):
  161. @self.add_base
  162. def find(browser, num, name, *args, **kwargs):
  163. nonlocal self,name_
  164. if browser == None:browser = self.browser
  165. if not_all:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_name(name_)]#返回必须是list
  166. else:self.element_dict[f'{name}[{num}]'] = browser.find_elements_by_name(name_)#返回必须是list
  167. self.add_func(f'find_name:{name_}',find)#添加func
  168. def find_xpath(self,xpath,not_all=False,**kwargs):
  169. @self.add_base
  170. def find(browser, num, name, *args, **kwargs):
  171. nonlocal self,xpath
  172. if browser == None:browser = self.browser
  173. if not_all:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_xpath(xpath)]#返回必须是list
  174. else:self.element_dict[f'{name}[{num}]'] = browser.find_elements_by_xpath(xpath)#返回必须是list
  175. self.add_func(f'find_xpath:{xpath}',find)#添加func
  176. def find_css(self,css_selector,not_all=False,**kwargs):
  177. @self.add_base
  178. def find(browser, num, name, *args, **kwargs):
  179. nonlocal self,css_selector
  180. if browser == None:browser = self.browser
  181. if not_all:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_css_selector(css_selector)]#返回必须是list
  182. else:self.element_dict[f'{name}[{num}]'] = browser.find_elements_by_css_selector(css_selector)#返回必须是list
  183. self.add_func(f'find_css:{css_selector}',find)#添加func
  184. def find_tag_name(self,tag_name,not_all=False,**kwargs):
  185. @self.add_base
  186. def find(browser, num, name, *args, **kwargs):
  187. nonlocal self,tag_name
  188. if browser == None:browser = self.browser
  189. if not_all:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_tag_name(tag_name)]#返回必须是list
  190. else:self.element_dict[f'{name}[{num}]'] = browser.find_elements_by_tag_name(tag_name)#返回必须是list
  191. self.add_func(f'find_tagName:{tag_name}',find)#添加func\
  192. def find_link_text(self,link_text,not_all=False,**kwargs):#匹配link
  193. @self.add_base
  194. def find(browser, num, name, *args, **kwargs):
  195. nonlocal self,link_text
  196. if browser == None:browser = self.browser
  197. if not_all:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_link_text(link_text)]#返回必须是list
  198. else:self.element_dict[f'{name}[{num}]'] = browser.find_elements_by_link_text(link_text)#返回必须是list
  199. self.add_func(f'find_link_text:{link_text}',find)#添加func
  200. def find_partial_link_text(self,partial_link_text,not_all=False,**kwargs):#模糊匹配
  201. @self.add_base
  202. def find(browser, num, name, *args, **kwargs):
  203. nonlocal self,partial_link_text
  204. if browser == None:browser = self.browser
  205. if not_all:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_partial_link_text(partial_link_text)]#返回必须是list
  206. else:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_partial_link_text(partial_link_text)]#返回必须是list
  207. self.add_func(f'find_partial_link_text:{partial_link_text}',find)#添加func
  208. def find_switch_to_alert(self,*args,**kwargs):#定位弹出框
  209. @self.add_base
  210. def find(browser, num, name, *args, **kwargs):
  211. nonlocal self
  212. if browser == None:browser = self.browser
  213. self.element_dict[f'{name}[{num}]'] = [browser.switch_to.alert()]
  214. self.add_func(f'find_alert',find)#添加func
  215. def find_switch_to_active_element(self,*args,**kwargs):#定位焦点元素
  216. @self.add_base
  217. def find(browser, num, name, *args, **kwargs):
  218. nonlocal self
  219. if browser == None:browser = self.browser
  220. self.element_dict[f'{name}[{num}]'] = [browser.switch_to.active_element()]
  221. self.add_func(f'active_element',find)#添加func
  222. def find_switch_to_frame(self,reference,is_id=False,*args,**kwargs):#定位Frame
  223. @self.add_base
  224. def find(browser, num, name, *args, **kwargs):
  225. nonlocal self,reference,is_id
  226. if browser == None:browser = self.browser
  227. if reference == None:
  228. self.element_dict[f'{name}[{num}]'] = [browser.default_content()]# 回到主文档
  229. elif reference == '':
  230. self.element_dict[f'{name}[{num}]'] = [browser.parent_frame()]# 回到父文档
  231. else:
  232. if is_id:reference = int(reference)
  233. self.element_dict[f'{name}[{num}]'] = [browser.switch_to.frame(str(reference))]# 定位进入文档
  234. func_name = {None:'主文档','':'父文档'}.get(reference,reference)
  235. self.add_func(f'find_frame:{func_name}',find)#添加func
  236. def send_keys(self,text,element_value,index=0,**kwargs):#输入文字
  237. @self.add_base
  238. def action(*args, **kwargs):
  239. nonlocal self
  240. self.element_dict[element_value][index].send_keys(text)
  241. self.add_func(f'sent_text:{text}>{element_value}[{index}]', action) # 添加func
  242. def User_Passwd(self,User,Passwd,element_value,index=0,**kwargs):#输入验证(User&Password)
  243. @self.add_base
  244. def action(*args, **kwargs):
  245. nonlocal self
  246. self.element_dict[element_value][index].authenticate(User,Passwd)
  247. self.add_func(f'User:Passwd:{User};{Passwd}>{element_value}[{index}]', action) # 添加func
  248. def clear(self,element_value,index=0,**kwargs):#清空文本
  249. @self.add_base
  250. def action(*args, **kwargs):
  251. nonlocal self
  252. self.element_dict[element_value][index].clear()
  253. self.add_func(f'clear_text>{element_value}[{index}]', action) # 添加func
  254. def click(self,element_value,index=0,**kwargs):#点击按钮
  255. @self.add_base
  256. def action(*args, **kwargs):
  257. nonlocal self
  258. self.element_dict[element_value][index].click()
  259. self.add_func(f'click>{element_value}[{index}]', action) # 添加func
  260. def accept(self,element_value,index=0,**kwargs):#点击确定(弹出框)
  261. @self.add_base
  262. def action(*args, **kwargs):
  263. nonlocal self
  264. self.element_dict[element_value][index].accept()
  265. self.add_func(f'accept>{element_value}[{index}]', action) # 添加func
  266. def dismiss(self,element_value,index=0,**kwargs):#点击取消(弹出框)
  267. @self.add_base
  268. def action(*args, **kwargs):
  269. nonlocal self
  270. self.element_dict[element_value][index].dismiss()
  271. self.add_func(f'dismiss>{element_value}[{index}]', action) # 添加func
  272. def submit(self,element_value,index=0,**kwargs):#提交表单
  273. @self.add_base
  274. def action(*args, **kwargs):
  275. nonlocal self
  276. self.element_dict[element_value][index].submit()
  277. self.add_func(f'submit>{element_value}[{index}]', action) # 添加func
  278. def deselect_by_index(self,element_value,deselect,index=0,**kwargs):#根据index取消选择
  279. @self.add_base
  280. def action(*args, **kwargs):
  281. nonlocal self
  282. self.element_dict[element_value][index].deselect_by_index(int(deselect))
  283. self.add_func(f'deselect_by_index:{deselect}>{element_value}[{index}]', action) # 添加func
  284. def deselect_by_text(self,element_value,deselect,index=0,**kwargs):#根据text取消选择
  285. @self.add_base
  286. def action(*args, **kwargs):
  287. nonlocal self
  288. self.element_dict[element_value][index].deselect_by_visible_text(deselect)
  289. self.add_func(f'deselect_by_text:{deselect}>{element_value}[{index}]', action) # 添加func
  290. def deselect_by_value(self,element_value,deselect,index=0,**kwargs):#根据value取消选择
  291. @self.add_base
  292. def action(*args, **kwargs):
  293. nonlocal self
  294. self.element_dict[element_value][index].deselect_by_value(deselect)
  295. self.add_func(f'deselect_by_value:{deselect}>{element_value}[{index}]', action) # 添加func
  296. def select_by_index(self,element_value,deselect,index=0,**kwargs):#根据index选择
  297. @self.add_base
  298. def action(*args, **kwargs):
  299. nonlocal self
  300. self.element_dict[element_value][index].select_by_index(int(deselect))
  301. self.add_func(f'select_by_index:{deselect}>{element_value}[{index}]', action) # 添加func
  302. def select_by_text(self,element_value,deselect,index=0,**kwargs):#根据text选择
  303. @self.add_base
  304. def action(*args, **kwargs):
  305. nonlocal self
  306. self.element_dict[element_value][index].select_by_visible_text(deselect)
  307. self.add_func(f'select_by_text:{deselect}>{element_value}[{index}]', action) # 添加func
  308. def select_by_value(self,element_value,deselect,index=0,**kwargs):#根据value选择
  309. @self.add_base
  310. def action(*args, **kwargs):
  311. nonlocal self
  312. self.element_dict[element_value][index].select_by_value(deselect)
  313. self.add_func(f'select_by_value:{deselect}>{element_value}[{index}]', action) # 添加func
  314. def back(self,**kwargs):# 返回
  315. @self.add_base
  316. def action(*args, **kwargs):
  317. nonlocal self
  318. self.browser.back()
  319. self.add_func(f'BACK', action)
  320. def forward(self,**kwargs):# 前进
  321. @self.add_base
  322. def action(*args, **kwargs):
  323. nonlocal self
  324. self.browser.forward()
  325. self.add_func(f'FORWARD', action)
  326. def refresh(self,**kwargs):# 刷新
  327. @self.add_base
  328. def action(*args, **kwargs):
  329. nonlocal self
  330. self.browser.refresh()
  331. self.add_func(f'REFRESH', action)
  332. def wait_sleep(self,time:int=2,**kwargs):#暴力等待
  333. @self.add_base
  334. def action(*args, **kwargs):
  335. nonlocal self
  336. sleep(time)
  337. self.add_func(f'WAIT:{time}s', action)
  338. def set_wait(self,time:int=2,**kwargs):#隐式等待
  339. @self.add_base
  340. def action(*args, **kwargs):
  341. nonlocal self
  342. sleep(time)
  343. self.add_func(f'Loading_wait:{time}s', action)
  344. def run_JS(self,JS,**kwargs):
  345. @self.add_base
  346. def action(num,name,*args, **kwargs):
  347. nonlocal self
  348. get = self.browser.execute_script(JS)
  349. if hasattr(get,'__getitem__'):#可切片
  350. self.element_dict[f'{name}[{num}]'] = get # 返回必须是list
  351. else:
  352. self.element_dict[f'{name}[{num}]'] = [get]
  353. self.add_func(f'run_js:{JS}', action)
  354. def Element_interaction(self,update_func=lambda *args:None):#元素交互
  355. func_list = self.func_list
  356. status = None
  357. def update(func_name):
  358. nonlocal status,self
  359. if status:
  360. success_code = 'Success to run'
  361. elif status is None:
  362. success_code = 'No status'
  363. else:
  364. success_code = 'Wrong to run'
  365. value_box = []
  366. for i in self.element_dict:
  367. try:
  368. value_box.append(f'{i}[{len(i)}] = {self.element_dict[i]}')
  369. except:
  370. value_box.append(f'{i} = {self.element_dict[i]}')
  371. update_func(func_name, success_code, value_box) # 信息更新系统
  372. for func_num in range(len(func_list)):
  373. func_name = func_list[func_num]
  374. update(func_name)
  375. status = self.func_dict[func_name](num=f'{func_num}',name='var')
  376. update('Finish')