Crawler_controller.py 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671
  1. from selenium import webdriver
  2. import threading
  3. import time
  4. from os.path import exists
  5. from os import mkdir
  6. import hashlib
  7. from time import sleep
  8. import bs4
  9. import re as regular
  10. import Information_storage
  # Module-level storage object shared by the whole module; the to_Database and
  # to_Database_by_re steps of Page_Parser call add_DataBase() on it.
  11. data_base = Information_storage.DataBase_Home()
  class URL_PAGE:
      """A queued page: the URL plus a label for how it should be requested.

      Only the ``'get'`` label is used within this file.
      """

      def __init__(self, url, func='get'):
          # Both values are stored exactly as given.
          self.url, self.func = url, func

      def __str__(self):
          """Render the entry as its bare URL."""
          return self.url
  class url:
      """URL manager: a filtered, de-duplicated FIFO queue of pages to fetch.

      Every accepted URL is appended to ``url_list`` (pending) and
      ``url_history`` (everything ever accepted) and written to the history
      log file.  URLs that are handed out or deleted are written to the run
      log file.  Call :meth:`close` (or use the instance as a context manager)
      to release both log files.
      """

      num = 0  # number of url managers created so far

      def __init__(self, dic='', dic_run=''):
          """Open both log files in append mode.

          dic     -- directory prefix for the accepted-URL history log
          dic_run -- directory prefix for the handed-out / deleted URL log
          """
          url.num += 1
          dic += f'/url[{url.num}].cot_url'
          dic_run += f'/url_run[{url.num}].cot_url'
          self.dir = dic
          self.dir_run = dic_run
          # URLs may contain non-ASCII characters, so do not depend on the
          # platform default encoding.
          self.file = open(dic, 'a', encoding='utf-8')  # accepted-URL history
          try:
              self.file_run = open(dic_run, 'a', encoding='utf-8')  # handed-out URLs
          except Exception:
              self.file.close()  # do not leak the first handle
              raise
          self.url_list = []  # pending URL_PAGE objects, oldest first
          self.url_history = []  # every accepted URL, in acceptance order
          self._seen = set()  # same content as url_history, for O(1) lookup
          self.filter = {}  # filter name -> predicate(url) -> bool

      def __enter__(self):
          return self

      def __exit__(self, exc_type, exc_value, traceback):
          self.close()
          return False

      def close(self):
          """Close both log files; safe to call more than once."""
          for handle in (self.file, self.file_run):
              if not handle.closed:
                  handle.close()

      def filter_func(self, url):
          """Return True only if every registered filter accepts ``url``."""
          return all(accept(url) for accept in self.filter.values())

      def Add_func(self, func, name):
          """Register (or replace) the filter predicate stored under ``name``."""
          self.filter[name] = func

      def Del_func(self, index):
          """Remove the filter at position ``index`` in registration order."""
          del self.filter[list(self.filter)[index]]

      def return_func(self):
          """Return the names of the registered filters."""
          return list(self.filter)

      def add_url(self, url):
          """Queue ``url`` if it is new and passes all filters.

          Returns True when the URL was queued, False when it was rejected.
          """
          if url in self._seen or not self.filter_func(url):
              return False
          self.url_list.append(URL_PAGE(url, 'get'))
          self.url_history.append(url)
          self._seen.add(url)
          self.__out_url(url)
          return True

      def del_url(self, index):
          """Drop the pending entry at ``index`` and log the deletion."""
          self.__out_url_run(f'DELETE {self.url_list[index]}')
          del self.url_list[index]

      def get_url(self) -> "URL_PAGE":
          """Remove and return the oldest pending entry.

          Raises IndexError when nothing is pending.
          """
          url_page = self.url_list[0]
          self.__out_url_run(url_page.url)
          del self.url_list[0]
          return url_page

      def __out_url(self, url):
          """Append ``url`` to the history log and flush immediately."""
          self.file.write(f'{url}\n')
          self.file.flush()

      def __out_url_run(self, url):
          """Append ``url`` to the run log and flush immediately."""
          self.file_run.write(f'{url}\n')
          self.file_run.flush()

      def return_url(self):
          """Return a copy of the pending entries."""
          return self.url_list.copy()

      def return_url_history(self):
          """Return a copy of every URL accepted so far."""
          return self.url_history.copy()
  class Page_Downloader:
      """Opens pages from a url manager in Chrome and exposes cookie helpers.

      A parser object may be attached with :meth:`set_Page_Parser`; it then
      receives the live browser each time :meth:`strat_urlGet` opens a page.
      """

      num = 0  # number of downloaders created so far

      def __init__(self, url: "url", dic=''):
          """
          url -- the url manager that supplies pages through ``get_url()``
          dic -- output directory, handed on to the attached parser
          """
          self.url = url
          self.dir = dic
          Page_Downloader.num += 1
          self.page_source_dict = {}  # saved page information
          self.cookie_Thread = None  # background cookie-polling thread
          self.browser = None
          self.Parser = None  # set by set_Page_Parser
          self.break_ = False  # True while the cookie poller should keep running

      def __seeting(self, *args):
          """Build ChromeOptions from extra command-line arguments.

          Empty strings in ``args`` are ignored.
          """
          options = webdriver.ChromeOptions()
          options.add_argument('disable-infobars')  # hide the info bar
          for argument in args:
              if argument == '':
                  continue
              options.add_argument(argument)
          return options

      def strat_urlGet(self, *args, func_cookie):
          """Take the next URL from the manager and open it in a new Chrome.

          args        -- extra Chrome command-line arguments
          func_cookie -- callback invoked about every 0.5 s with the browser's
                         current cookie list (used to refresh the GUI)

          Returns the new browser.  Propagates IndexError from the manager
          when no URL is pending.  A browser opened by an earlier call is not
          quit here; use :meth:`close` for that.
          """
          self.break_ = False  # ask any previous poller to stop
          self.page_source_dict = {}
          self.nowurl = self.url.get_url()
          url = self.nowurl.url
          # ``options=`` replaces the ``chrome_options=`` keyword, which
          # current Selenium releases no longer accept.
          browser = webdriver.Chrome(options=self.__seeting(*args))
          self.browser = browser
          browser.get(url)
          self.break_ = True

          def update_cookie():
              # Stop when asked to, or as soon as a newer browser replaced
              # the one this thread was started for.
              while self.break_ and self.browser is browser:
                  try:
                      func_cookie(browser.get_cookies())
                  except Exception:
                      pass  # best effort: the browser or GUI may be going away
                  # Sleep outside the try so a failing browser cannot turn
                  # this loop into a busy spin.
                  time.sleep(.5)

          # Daemon thread: a forgotten poller must not block interpreter exit.
          self.cookie_Thread = threading.Thread(target=update_cookie, daemon=True)
          self.cookie_Thread.start()
          if self.Parser is not None:
              self.Parser.browser = self.browser
              self.Parser.init(url)
          return self.browser

      def close(self):
          """Stop the cookie poller and quit the current browser, if any."""
          self.break_ = False
          self.cookie_Thread = None
          browser, self.browser = self.browser, None
          if browser is not None:
              browser.quit()

      def Del_cookies(self, name):
          """Delete the cookie called ``name``."""
          self.browser.delete_cookie(name)

      def Tra_cookies(self):
          """Delete all cookies."""
          self.browser.delete_all_cookies()

      def Add_cookies(self, cookies: dict):
          """Add one cookie, given as a cookie dict."""
          self.browser.add_cookie(cookies)

      def update_cookies(self, name, cookies: dict,):
          """Merge ``cookies`` into the existing cookie called ``name``.

          Raises LookupError when the browser has no cookie with that name.
          """
          browser = self.browser
          for cookie in browser.get_cookies():
              if cookie.get('name', None) == name:
                  browser.delete_cookie(name)  # drop the old value first
                  cookie.update(cookies)
                  browser.add_cookie(cookie)
                  return
          raise LookupError(f'no cookie named {name!r}')

      def set_Page_Parser(self, Parser):
          """Attach ``Parser`` and share browser, url manager and directory."""
          self.Parser = Parser
          self.Parser.browser = self.browser
          self.Parser.url = self.url
          self.Parser.dir = self.dir
  class Page_Parser:
      """Records a script of browser / parsing steps and replays it in order.

      The public ``find_*``, action and parsing methods do not act right away.
      Each one wraps its work in a step function and registers it through
      :meth:`add_func` under a display label.  :meth:`Element_interaction`
      later runs the steps in registration order, passing ``name='var'`` and
      the step number as ``num``; a step that produces a result stores it in
      ``self.element_dict['var[<num>]']``.  Results are lists so that later
      steps can index or slice them.

      ``browser``, ``url`` and ``dir`` are assigned from outside by the
      downloader's ``set_Page_Parser``.
      """

      def __init__(self, Downloader: "Page_Downloader"):
          self.Downloader = Downloader
          self.Downloader.set_Page_Parser(self)
          self.func_list = []  # step labels, in execution order
          self.func_dict = {}  # step label -> wrapped step function
          self.init()

      def init(self, url=''):
          """Reset the per-page state; ``url`` is the page now being handled."""
          self.element_dict = {}  # 'name[num]' -> results stored by that step
          self.now_url = url

      def add_base(self, func):
          """Decorator: run ``func`` and report success as a bool.

          The wrapper returns True when ``func`` finishes and False when it
          raises any ``Exception``.
          """
          def wrap(browser=None, num=None, name=None, *args, **kwargs) -> bool:
              try:
                  func(browser=browser, num=num, name=name, *args, **kwargs)
              except Exception:
                  return False
              return True
          return wrap

      def add_func(self, name, func):
          """Register ``func`` as the next step, labelled ``name[<position>]``."""
          label = f'{name}[{len(self.func_list)}]'
          self.func_list.append(label)
          self.func_dict[label] = func

      def return_func(self, only=True):
          """Return the step labels, optionally prefixed with their variable."""
          if only:
              return self.func_list.copy()
          return [f'var[{index}]@ {label}' for index, label in enumerate(self.func_list)]

      # ---- private helpers that build and register steps -------------------

      def _add_locator(self, label, locate):
          """Register a step that stores ``locate(browser)`` as its result."""
          @self.add_base
          def find(browser, num, name, *args, **kwargs):
              if browser is None:
                  browser = self.browser
              self.element_dict[f'{name}[{num}]'] = locate(browser)
          self.add_func(label, find)

      @staticmethod
      def _by(by, value, not_all):
          """Build a locator for the strategy string ``by`` ('id', 'xpath', ...).

          The generic find_element / find_elements calls are used because the
          find_element_by_* shortcuts are removed in Selenium 4.3+.
          """
          def locate(browser):
              if not_all:
                  return [browser.find_element(by, value)]  # result must be a list
              return browser.find_elements(by, value)
          return locate

      def _add_element_action(self, label, element_value, index, operate):
          """Register a step that applies ``operate`` to one stored element."""
          @self.add_base
          def action(*args, **kwargs):
              operate(self.element_dict[element_value][index])
          self.add_func(label, action)

      def _add_browser_action(self, label, operate):
          """Register a step that applies ``operate`` to the current browser."""
          @self.add_base
          def action(*args, **kwargs):
              operate(self.browser)
          self.add_func(label, action)

      def _add_producer(self, label, produce):
          """Register a step that stores the value returned by ``produce()``."""
          @self.add_base
          def action(num, name, *args, **kwargs):
              self.element_dict[f'{name}[{num}]'] = produce()
          self.add_func(label, action)

      @staticmethod
      def _as_select(element):
          """Return an object offering the select_* / deselect_* methods.

          A plain WebElement does not have them; they live on selenium's
          Select helper, so the element is wrapped when necessary.
          """
          if hasattr(element, 'select_by_index'):
              return element
          from selenium.webdriver.support.ui import Select
          return Select(element)

      @staticmethod
      def _parse_limit(limit):
          """Convert ``limit`` to int; anything unusable means "no limit"."""
          try:
              return int(limit)
          except (TypeError, ValueError, OverflowError):
              return None

      def _unique_name(self):
          """Return an MD5 hex name derived from the current time and URL."""
          digest = hashlib.md5()
          digest.update(f'{time.time()}_{self.now_url}'.encode('utf-8'))
          return digest.hexdigest()

      # ---- element location ------------------------------------------------

      def find_ID(self, id, not_all=False, **kwargs):
          """Add a step locating elements by id (only the first if ``not_all``)."""
          self._add_locator(f'find_ID:{id}', self._by('id', id, not_all))

      def find_class(self, class_name, not_all=False, **kwargs):
          """Add a step locating elements by class name."""
          self._add_locator(f'find_class:{class_name}', self._by('class name', class_name, not_all))

      def find_name(self, name_, not_all=False, **kwargs):
          """Add a step locating elements by their name attribute."""
          self._add_locator(f'find_name:{name_}', self._by('name', name_, not_all))

      def find_xpath(self, xpath, not_all=False, **kwargs):
          """Add a step locating elements by XPath."""
          self._add_locator(f'find_xpath:{xpath}', self._by('xpath', xpath, not_all))

      def find_css(self, css_selector, not_all=False, **kwargs):
          """Add a step locating elements by CSS selector."""
          self._add_locator(f'find_css:{css_selector}', self._by('css selector', css_selector, not_all))

      def find_tag_name(self, tag_name, not_all=False, **kwargs):
          """Add a step locating elements by tag name."""
          self._add_locator(f'find_tagName:{tag_name}', self._by('tag name', tag_name, not_all))

      def find_link_text(self, link_text, not_all=False, **kwargs):
          """Add a step locating links by their exact text."""
          self._add_locator(f'find_link_text:{link_text}', self._by('link text', link_text, not_all))

      def find_partial_link_text(self, partial_link_text, not_all=False, **kwargs):
          """Add a step locating links whose text contains the given text."""
          self._add_locator(f'find_partial_link_text:{partial_link_text}',
                            self._by('partial link text', partial_link_text, not_all))

      def find_switch_to_alert(self, *args, **kwargs):
          """Add a step storing the currently open alert."""
          # switch_to.alert is a property, not a method.
          self._add_locator(f'find_alert', lambda browser: [browser.switch_to.alert])

      def find_switch_to_active_element(self, *args, **kwargs):
          """Add a step storing the element that currently has focus."""
          # switch_to.active_element is a property, not a method.
          self._add_locator(f'active_element', lambda browser: [browser.switch_to.active_element])

      def find_switch_to_frame(self, reference, is_id=False, *args, **kwargs):
          """Add a step switching the browser to another frame.

          reference -- None: back to the main document; '': back to the parent
                       frame; anything else: the frame to enter
          is_id     -- when true, ``reference`` is converted to an integer
                       frame index; otherwise it is passed on as a string
          """
          def locate(browser):
              if reference is None:
                  return [browser.switch_to.default_content()]  # main document
              if reference == '':
                  return [browser.switch_to.parent_frame()]  # parent frame
              target = int(reference) if is_id else str(reference)
              return [browser.switch_to.frame(target)]
          func_name = {None: '主文档', '': '父文档'}.get(reference, reference)
          self._add_locator(f'find_frame:{func_name}', locate)

      # ---- element actions -------------------------------------------------

      def send_keys(self, text, element_value, index=0, **kwargs):
          """Add a step typing ``text`` into a stored element."""
          self._add_element_action(f'sent_text:{text}>{element_value}[{index}]', element_value, index,
                                   lambda element: element.send_keys(text))

      def User_Passwd(self, User, Passwd, element_value, index=0, **kwargs):
          """Add a step calling ``authenticate(User, Passwd)`` on a stored object."""
          # NOTE(review): authenticate() is presumably the alert API of older
          # Selenium releases -- confirm it exists in the installed version.
          self._add_element_action(f'User:Passwd:{User};{Passwd}>{element_value}[{index}]', element_value,
                                   index, lambda element: element.authenticate(User, Passwd))

      def clear(self, element_value, index=0, **kwargs):
          """Add a step clearing the text of a stored element."""
          self._add_element_action(f'clear_text>{element_value}[{index}]', element_value, index,
                                   lambda element: element.clear())

      def click(self, element_value, index=0, **kwargs):
          """Add a step clicking a stored element."""
          self._add_element_action(f'click>{element_value}[{index}]', element_value, index,
                                   lambda element: element.click())

      def accept(self, element_value, index=0, **kwargs):
          """Add a step accepting (OK) a stored alert."""
          self._add_element_action(f'accept>{element_value}[{index}]', element_value, index,
                                   lambda element: element.accept())

      def dismiss(self, element_value, index=0, **kwargs):
          """Add a step dismissing (Cancel) a stored alert."""
          self._add_element_action(f'dismiss>{element_value}[{index}]', element_value, index,
                                   lambda element: element.dismiss())

      def submit(self, element_value, index=0, **kwargs):
          """Add a step submitting the form of a stored element."""
          self._add_element_action(f'submit>{element_value}[{index}]', element_value, index,
                                   lambda element: element.submit())

      def deselect_by_index(self, element_value, deselect, index=0, **kwargs):
          """Add a step deselecting the option at position ``deselect``."""
          self._add_element_action(
              f'deselect_by_index:{deselect}>{element_value}[{index}]', element_value, index,
              lambda element: self._as_select(element).deselect_by_index(int(deselect)))

      def deselect_by_text(self, element_value, deselect, index=0, **kwargs):
          """Add a step deselecting the option with visible text ``deselect``."""
          self._add_element_action(
              f'deselect_by_text:{deselect}>{element_value}[{index}]', element_value, index,
              lambda element: self._as_select(element).deselect_by_visible_text(deselect))

      def deselect_by_value(self, element_value, deselect, index=0, **kwargs):
          """Add a step deselecting the option with value ``deselect``."""
          self._add_element_action(
              f'deselect_by_value:{deselect}>{element_value}[{index}]', element_value, index,
              lambda element: self._as_select(element).deselect_by_value(deselect))

      def select_by_index(self, element_value, deselect, index=0, **kwargs):
          """Add a step selecting the option at position ``deselect``."""
          self._add_element_action(
              f'select_by_index:{deselect}>{element_value}[{index}]', element_value, index,
              lambda element: self._as_select(element).select_by_index(int(deselect)))

      def select_by_text(self, element_value, deselect, index=0, **kwargs):
          """Add a step selecting the option with visible text ``deselect``."""
          self._add_element_action(
              f'select_by_text:{deselect}>{element_value}[{index}]', element_value, index,
              lambda element: self._as_select(element).select_by_visible_text(deselect))

      def select_by_value(self, element_value, deselect, index=0, **kwargs):
          """Add a step selecting the option with value ``deselect``."""
          self._add_element_action(
              f'select_by_value:{deselect}>{element_value}[{index}]', element_value, index,
              lambda element: self._as_select(element).select_by_value(deselect))

      # ---- browser actions -------------------------------------------------

      def back(self, **kwargs):
          """Add a step navigating back in the browser history."""
          self._add_browser_action(f'BACK', lambda browser: browser.back())

      def forward(self, **kwargs):
          """Add a step navigating forward in the browser history."""
          self._add_browser_action(f'FORWARD', lambda browser: browser.forward())

      def refresh(self, **kwargs):
          """Add a step reloading the current page."""
          self._add_browser_action(f'REFRESH', lambda browser: browser.refresh())

      def wait_sleep(self, time: int = 2, **kwargs):
          """Add a step that unconditionally sleeps for ``time`` seconds."""
          @self.add_base
          def action(*args, **kwargs):
              sleep(time)
          self.add_func(f'WAIT:{time}s', action)

      def set_wait(self, time: int = 2, **kwargs):
          """Add a step setting the browser's implicit wait to ``time`` seconds."""
          # This used to sleep, which made it a duplicate of wait_sleep
          # instead of the implicit wait it is meant to be.
          self._add_browser_action(f'Loading_wait:{time}s',
                                   lambda browser: browser.implicitly_wait(time))

      def run_JS(self, JS, **kwargs):
          """Add a step running JavaScript and storing what it returns."""
          def produce():
              result = self.browser.execute_script(JS)
              # Stored results must be lists; wrap scalars, strings and dicts
              # instead of storing them bare.
              if isinstance(result, (list, tuple)):
                  return list(result)
              return [result]
          self._add_producer(f'run_js:{JS}', produce)

      def to_text(self, **kwargs):
          """Add a step storing ``[page source, current url]``."""
          self._add_producer(f'get_page_source',
                             lambda: [self.browser.page_source, self.now_url])

      def out_html(self, element_value, **kwargs):
          """Add a step writing a stored ``[source, url]`` pair to ``self.dir``.

          The source goes to ``<md5>.html`` and the URL to
          ``<md5>.html.CoTanURL``.
          """
          @self.add_base
          def action(*args, **kwargs):
              save_dir = self.dir + '/' + self._unique_name() + '.html'
              print(save_dir)
              # Page sources routinely contain non-ASCII text, so do not
              # depend on the platform default encoding.
              with open(save_dir, 'w', encoding='utf-8') as f:
                  f.write(self.element_dict[element_value][0])
              with open(save_dir + '.CoTanURL', 'w', encoding='utf-8') as f:
                  f.write(self.element_dict[element_value][1])
          self.add_func(f'write_html<{element_value}', action)

      # ---- cookies ---------------------------------------------------------

      def del_all_cookies(self, **kwargs):
          """Add a step deleting every cookie."""
          self._add_browser_action(f'del_all_cookies', lambda browser: browser.delete_all_cookies())

      def del_cookies(self, cookies_name, **kwargs):
          """Add a step deleting the cookie called ``cookies_name``."""
          self._add_browser_action(f'del_cookies:{cookies_name}',
                                   lambda browser: browser.delete_cookie(cookies_name))

      def add_cookies(self, cookies, **kwargs):
          """Add a step adding the cookie dict ``cookies``."""
          self._add_browser_action(f'add_cookies:{cookies}',
                                   lambda browser: browser.add_cookie(cookies))

      def update_cookies(self, cookies_name, cookies, **kwargs):
          """Add a step merging ``cookies`` into an existing cookie."""
          def operate(browser):
              current = browser.get_cookie(cookies_name)
              if current is None:
                  raise LookupError(f'no cookie named {cookies_name!r}')
              current.update(cookies)
              browser.delete_cookie(cookies_name)
              browser.add_cookie(current)
          # The label used to read 'add_cookies', which made this step
          # indistinguishable from add_cookies in the step list.
          self._add_browser_action(f'update_cookies:{cookies}', operate)

      def get_cookies(self, cookies_name, **kwargs):
          """Add a step storing the cookie called ``cookies_name``."""
          self._add_producer(f'get_cookies:{cookies_name}',
                             lambda: [self.browser.get_cookie(cookies_name)])

      def get_all_cookies(self, **kwargs):
          """Add a step storing the list of all cookies."""
          # get_cookie() needs a name; get_cookies() returns them all.
          self._add_producer(f'get_all_cookies', lambda: self.browser.get_cookies())

      # ---- BeautifulSoup parsing -------------------------------------------

      def make_bs(self, element_value, **kwargs):
          """Add a step parsing stored HTML text into a BeautifulSoup object."""
          self._add_producer(
              f'Parsing:{element_value}',
              lambda: [bs4.BeautifulSoup(self.element_dict[element_value][0], "html.parser")])

      def listSlicing(self, index: (slice, int), element_value):
          """Return the stored results selected by ``index``, always as a list."""
          if isinstance(index, int):
              return [self.element_dict[element_value][index]]
          return self.element_dict[element_value][index]

      def to_Database(self, element_value, index, data: (str, list), dataBase_name: str, **kwargs):
          """Add a step writing one database row per selected bs4 node.

          Each item of ``data`` names one column: ``'$name&'`` is the tag
          name, ``'$self&'`` the node's markup, ``'$string$'`` its string, and
          anything else an attribute name (missing attributes become '').
          """
          # NOTE(review): the three placeholder spellings are inconsistent but
          # are left untouched because callers outside this file pass them.
          @self.add_base
          def action(*args, **kwargs):
              for bs in self.listSlicing(index, element_value):
                  row = []
                  for field in data:
                      if field == '$name&':
                          row.append(bs.name)
                      elif field == '$self&':
                          row.append(str(bs).replace('\n', ''))
                      elif field == '$string$':
                          row.append(str(bs.string).replace('\n', ''))
                      else:
                          row.append(bs.attrs.get(field, ''))
                  data_base.add_DataBase(dataBase_name, row)
          self.add_func(f'DataBase:{data}<{element_value}[{index}]>{dataBase_name}', action)

      def to_Database_by_re(self, element_value, index, data: str, dataBase_name: str, **kwargs):
          """Add a step writing the regex matches of each selected node.

          ``data`` is compiled immediately, so an invalid pattern raises here
          rather than when the step runs.
          """
          data = regular.compile(data)

          @self.add_base
          def action(*args, **kwargs):
              for bs in self.listSlicing(index, element_value):
                  data_base.add_DataBase(dataBase_name, data.findall(str(bs)))
          self.add_func(f'DataBase:{data}<{element_value}[{index}]>{dataBase_name}', action)

      def findAll(self, element_value, tag: (str, list), attribute: dict, limit, recursive,
                  index: (slice, int), **kwargs):
          """Add a step collecting ``find_all(tag, attribute)`` of selected nodes.

          A comma-separated ``tag`` string is split into a list.  When a node
          cannot be searched, the node itself is kept if its name is in
          ``tag`` and every attribute matches (string equality, otherwise a
          regex match).
          """
          if isinstance(tag, str):
              tag = tag.split(',')
          limit = self._parse_limit(limit)

          def matches_itself(bs):
              if str(bs.name) not in tag:
                  return False
              for attribute_name, wanted in attribute.items():
                  actual = bs.attrs[attribute_name]
                  if isinstance(wanted, str):
                      if actual != wanted:
                          return False
                  elif not regular.match(wanted, actual):  # compiled pattern
                      return False
              return True

          @self.add_base
          def action(num, name, *args, **kwargs):
              paser_list = []
              for bs in self.listSlicing(index, element_value):
                  try:
                      found = bs.find_all(tag, attribute, limit=limit, recursive=recursive)
                  except Exception:
                      try:
                          found = [bs] if matches_itself(bs) else []
                      except Exception:
                          found = []
                  paser_list += found
              self.element_dict[f'{name}[{num}]'] = paser_list
          self.add_func(f'findAll:{element_value}[{index}]', action)

      def findAll_by_text(self, element_value, text: (regular.compile, str), limit, recursive,
                          index: (slice, int), **kwargs):
          """Add a step collecting ``find_all(text=text)`` of selected nodes.

          When a node cannot be searched, the node itself is kept if its
          string equals ``text`` (or matches it, for a compiled pattern).
          """
          limit = self._parse_limit(limit)

          def matches_itself(bs):
              if isinstance(text, str):
                  return str(bs.string) == text
              return bool(regular.match(text, str(bs.string)))

          @self.add_base
          def action(num, name, *args, **kwargs):
              paser_list = []
              for bs in self.listSlicing(index, element_value):
                  try:
                      found = bs.find_all(text=text, limit=limit, recursive=recursive)
                  except Exception:
                      try:
                          found = [bs] if matches_itself(bs) else []
                      except Exception:
                          found = []
                  paser_list += found
              self.element_dict[f'{name}[{num}]'] = paser_list
          self.add_func(f'findAll_by_text:{element_value}[{index}]', action)

      def __get_other_base(self, element_value, index: (slice, int), who='children', **kwargs):
          """Shared implementation of the children / offspring / sibling steps.

          ``who`` is one of 'children', 'offspring', 'up', 'down' or
          'brothers'; unknown values behave like 'children'.
          """
          attribute_of = {'children': 'children', 'offspring': 'descendants',
                          'down': 'next_siblings', 'up': 'previous_siblings'}

          @self.add_base
          def action(num, name, *args, **kwargs):
              paser_list = []
              for bs in self.listSlicing(index, element_value):
                  if who != 'brothers':
                      paser_list += getattr(bs, attribute_of.get(who, 'children'))
                  else:
                      paser_list += bs.previous_siblings
                      paser_list += bs.next_siblings
              # dict.fromkeys removes duplicates like set() did, but keeps
              # the order in which the nodes were collected.
              self.element_dict[f'{name}[{num}]'] = list(dict.fromkeys(paser_list))
          self.add_func(f'get_{who}:{element_value}[{index}]', action)

      def get_children(self, element_value, index: (slice, int), **kwargs):
          """Add a step collecting the direct children of the selected nodes."""
          return self.__get_other_base(element_value, index)

      def get_offspring(self, element_value, index: (slice, int), **kwargs):
          """Add a step collecting all descendants of the selected nodes."""
          return self.__get_other_base(element_value, index, 'offspring')

      def get_up(self, element_value, index: (slice, int), **kwargs):
          """Add a step collecting the previous siblings of the selected nodes."""
          return self.__get_other_base(element_value, index, 'up')

      def get_down(self, element_value, index: (slice, int), **kwargs):
          """Add a step collecting the next siblings of the selected nodes."""
          return self.__get_other_base(element_value, index, 'down')

      def get_brothers(self, element_value, index: (slice, int), **kwargs):
          """Add a step collecting all siblings of the selected nodes."""
          return self.__get_other_base(element_value, index, 'brothers')

      def get_by_path(self, element_value, index: (slice, int), path, **kwargs):
          """Add a step evaluating ``path`` against each selected node.

          ``path`` is a Python expression in which ``self`` is the node, e.g.
          ``self.body.div``.  Nodes for which it fails or yields None are
          skipped.
          """
          @self.add_base
          def action(num, name, *args, **kwargs):
              paser_list = []
              for bs in self.listSlicing(index, element_value):
                  try:
                      # SECURITY: eval() runs arbitrary code; ``path`` must
                      # only ever come from a trusted operator.
                      found = eval(str(path), {'self': bs})
                  except Exception:
                      continue
                  if found is not None:
                      paser_list.append(found)
              self.element_dict[f'{name}[{num}]'] = paser_list
          self.add_func(f'get>{path}:{element_value}[{index}]', action)

      def Webpage_snapshot(self, **kwargs):
          """Add a step saving a screenshot of the page into ``self.dir``.

          The image goes to ``<md5>.png`` and the URL to
          ``<md5>.png.CoTanURL``.
          """
          @self.add_base
          def action(num, name, *args, **kwargs):
              file_name = self._unique_name()
              with open(self.dir + '/' + file_name + '.png.CoTanURL', 'w', encoding='utf-8') as f:
                  f.write(self.now_url)
              self.browser.save_screenshot(self.dir + '/' + file_name + '.png')
              sleep(1)
          self.add_func(f'Webpage_snapshot', action)

      def Element_interaction(self, update_func=lambda *args: None):
          """Run every registered step in order.

          ``update_func(step_label, status_text, values)`` is called once with
          'start', once before each step, and once with 'Finish'.
          ``status_text`` describes the outcome of the previously run step and
          ``values`` lists the results stored so far.
          """
          status = None

          def update(func_name):
              if status:
                  success_code = 'Success to run'
              elif status is None:
                  success_code = 'No status'
              else:
                  success_code = 'Wrong to run'
              value_box = []
              for key, value in self.element_dict.items():
                  try:
                      # Show how many results the step produced (this used to
                      # print the length of the key string instead).
                      value_box.append(f'{key}[{len(value)}] = {value}')
                  except Exception:
                      value_box.append(f'{key} = {value}')
              update_func(func_name, success_code, value_box)

          update('start')
          for func_num, func_name in enumerate(self.func_list.copy()):
              update(func_name)
              status = self.func_dict[func_name](num=f'{func_num}', name='var')
          update('Finish')