Crawler_controller.py 34 KB


  1. from selenium import webdriver
  2. import threading
  3. import time
  4. from os.path import exists
  5. from os import mkdir
  6. import hashlib
  7. from time import sleep
  8. import bs4
  9. import re as regular
  10. import Information_storage
  11. import requests
# Shared storage object; the Page_Parser database actions in this file
# (to_Database / to_Database_by_re) call data_base.add_DataBase(...) on it.
data_base = Information_storage.DataBase_Home()
  13. class PAGE:
  14. def __init__(self):
  15. self.url=''
  16. self.UA=''
  17. self.func = 'PAGE'
  18. def __str__(self):
  19. return f'{self.func}-{self.url}:UA>{self.UA}'
  20. class REQUESTS_Base(PAGE):
  21. def init(self,UA,url,cookies):
  22. if UA == '':
  23. UA = f'--user-agent ="Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' \
  24. f'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36 Edg/80.0.361.66"'
  25. self.UA = UA
  26. self.headers = {'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
  27. 'Accept - Encoding': 'gzip, deflate',
  28. 'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
  29. 'Connection': 'Keep-Alive',
  30. 'User-Agent': UA}
  31. self.requests = lambda *args:None
  32. self.url = url
  33. self.cookies = cookies
  34. self.new = True
  35. class URL_POST(REQUESTS_Base):#通过requests的post请求
  36. def __init__(self, url, data,UA='',cookies=None, **kwargs):
  37. super(URL_POST, self).__init__()
  38. self.func = 'post'
  39. self.data = data
  40. self.requests = requests.post
  41. self.init(UA,url,cookies)
  42. def __str__(self):
  43. return super(URL_POST, self).__str__() + f';data>{self.data}'
  44. class URL_GET(REQUESTS_Base):#通过requests的post请求
  45. def __init__(self, url,UA='',cookies=None, **kwargs):
  46. super(URL_GET, self).__init__()
  47. self.func = 'simplify_get'
  48. self.requests = requests.get
  49. self.init(UA,url,cookies)
  50. class URL_PAGE(PAGE):
  51. def __init__(self,url,first_run=False,head=False,no_plugins=True,no_js=False,no_java=False,
  52. no_img=False,UA='',cookies=None,new=False,down_load_dir='',**kwargs):
  53. super(URL_PAGE, self).__init__()
  54. self.url = url
  55. self.func = 'get'
  56. self.options = webdriver.ChromeOptions()
  57. self.cookies = cookies#cookies存储位置
  58. self.new = new#新键页面or新键浏览器
  59. self.down_load_dir = down_load_dir
  60. self.init(first_run,head,no_plugins,no_js,no_java,no_img,UA)
  61. def init(self,first_run,head,no_plugins,no_js,no_java,no_img,UA):
  62. self.options.add_argument('disable-infobars')#不显示
  63. prefs = {'profile.default_content_settings.popups': 0, 'download.default_directory':self.down_load_dir}
  64. self.options.add_experimental_option('prefs', prefs)#下载设置
  65. if first_run:
  66. self.options.add_argument('-first run')
  67. if head:#无头设置
  68. print('FFF')
  69. self.options.add_argument('--headless')
  70. self.options.add_argument('--disable-gpu')
  71. if no_plugins:
  72. self.options.add_argument('--disable-plugins')
  73. if no_js:
  74. self.options.add_argument('--disable-javascript')
  75. if no_java:
  76. self.options.add_argument('--disable-java')
  77. if no_img:
  78. self.options.add_argument('blink-settings=imagesEnabled=false')
  79. if UA == '':
  80. UA = (f'user-agent ="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
  81. f'Chrome/80.0.3987.132 Safari/537.36"')
  82. # self.options.add_argument(f'--user-agent ="{UA}"')
  83. self.UA = UA
  84. def __str__(self):
  85. return f'{self.func}-{self.url}:UA>{self.UA}'
  86. class url:#url管理器
  87. num = 0#url处理器个数
  88. def __init__(self,dic=f'',dic_run=f''):
  89. url.num += 1
  90. self.save_dir = dic
  91. dic += f'/url[{url.num}].cot_url'
  92. dic_run += f'/url_run[{url.num}].cot_url'
  93. self.dir = dic
  94. self.dir_run = dic_run
  95. self.file = open(dic,'a')#写入url_history的文件
  96. self.file_run = open(dic_run,'a')#写入已读url文件
  97. self.url_list = []#待读url
  98. self.url_history = []#url历史
  99. self.filter = {}#过滤函数
  100. def filter_func(self,url,**kwargs):#url过滤系统
  101. for i in self.filter:
  102. if not self.filter[i](url): return False
  103. return True
  104. def Add_func(self,func,name):#添加过滤函数
  105. self.filter[name] = func
  106. def Del_func(self,index):#删除过滤函数
  107. del self.filter[list(self.filter.keys())[index]]
  108. def return_func(self):
  109. return list(self.filter.keys())
  110. def add_url(self,url,func,data=None,**kwargs):#添加url
  111. if func == '':func = 'simplify_get'
  112. if func == 'get':url_ = url
  113. else:
  114. url_ = url + str(data)
  115. if url_ not in self.url_history and self.filter_func(url,func=func):#1.url不存在历史,2.url满足筛选条件
  116. if func == 'get':
  117. self.url_list.append(URL_PAGE(url=url,**kwargs,down_load_dir=self.dir))#添加到待取得url
  118. elif func == 'simplify_get':
  119. self.url_list.append(URL_GET(url=url, **kwargs, down_load_dir=self.dir)) # 添加到待取得url
  120. else:
  121. self.url_list.append(URL_POST(url=url,data=data,**kwargs)) # 添加到待取得url
  122. self.url_history.append(url_)#添加到历史url
  123. self.__out_url(url_)#输出历史url
  124. return True#写入成功
  125. return False#写入失败
  126. def del_url(self,index):#删除url
  127. self.__out_url_run(f'DELETE {self.url_list[index]}')
  128. del self.url_list[index]
  129. def get_url(self) -> (URL_PAGE,URL_POST):#取得url
  130. url_page = self.url_list[0]
  131. self.__out_url_run(url_page.url)
  132. del self.url_list[0]
  133. return url_page
  134. def __out_url(self,url):#输出url历史
  135. self.file.write(f'{url}\n')
  136. self.file.flush()
  137. def __out_url_run(self,url):#输出已经运行的url
  138. self.file_run.write(f'{url}\n')
  139. self.file_run.flush()
  140. def return_url(self):
  141. return self.url_list.copy()
  142. def return_url_history(self):
  143. return self.url_history.copy()
  144. class Page_Downloader:
  145. num = 0
  146. def __init__(self,url:url,dic=''):
  147. self.url = url
  148. self.dir = dic
  149. Page_Downloader.num += 1
  150. self.page_source_dict = {}#页面保存信息
  151. self.cookie_Thread = None#子进程
  152. self.browser = None
  153. self.cookie_dict = {}
  154. self.cookie_dict_list = {}#sele的cookies
  155. self.lase_func = ''
  156. def strat_urlGet(self,*args,func_cookie):#用get请求url ->得到一个页面信息
  157. self.break_ = False
  158. self.page_source_dict = {}
  159. self.nowurl = self.url.get_url()#获取一个url
  160. url = self.nowurl.url
  161. if self.nowurl.func == 'get':
  162. if self.nowurl.new == True and self.lase_func == 'get':#重新启动
  163. self.browser.quit()
  164. self.browser = webdriver.Chrome(chrome_options=self.nowurl.options)
  165. try:
  166. self.browser.get(url)
  167. except:
  168. self.browser = webdriver.Chrome(chrome_options=self.nowurl.options)
  169. self.browser.get(url)
  170. try:
  171. if self.nowurl.new != True:raise Exception
  172. list_ = self.cookie_dict_list[self.nowurl.cookies]
  173. self.Tra_cookies()
  174. try:
  175. for i in list_:
  176. self.Add_cookies(i)
  177. except:pass
  178. except:
  179. pass
  180. self.start_cookies(func_cookie,url)
  181. else:#requests模式
  182. try:
  183. args = {'cookies':self.cookie_dict[self.nowurl.cookies]}
  184. func_cookie([args['cookies']])
  185. except:
  186. args = {}
  187. func_cookie([])
  188. if self.nowurl.func == 'post':args['data'] = self.nowurl.data
  189. self.browser = self.nowurl.requests(url,headers=self.nowurl.headers,**args)
  190. self.cookie_dict[url] = requests.utils.dict_from_cookiejar(self.browser.cookies)#保存cookies
  191. func_cookie([self.cookie_dict[url]])
  192. self.lase_func = self.nowurl.func
  193. self.Parser.browser = self.browser
  194. self.Parser.init(url)
  195. return self.browser
  196. def start_cookies(self,func_cookie,url):
  197. self.break_ = True
  198. def update_cookie():
  199. nonlocal self
  200. while self.break_:
  201. try:
  202. cookies = self.browser.get_cookies()
  203. func_cookie(cookies) # 与GUI通信显示cookie
  204. self.cookie_dict[url] = cookies
  205. time.sleep(.5)
  206. except:
  207. pass
  208. self.cookie_Thread = threading.Thread(target=update_cookie)
  209. self.cookie_Thread.start()
  210. def Del_cookies(self,name):#删除指定cookies
  211. browser = self.browser
  212. browser.delete_cookie(name)
  213. def Tra_cookies(self):#清空cookies
  214. browser = self.browser
  215. browser.delete_all_cookies()
  216. def Add_cookies(self,cookies:dict):#清空cookies
  217. browser = self.browser
  218. browser.add_cookie(cookies)
  219. def update_cookies(self,name,cookies:dict):
  220. browser = self.browser
  221. cookies_list = browser.get_cookies()
  222. for i in cookies_list:
  223. if i.get('name',None) == name:
  224. browser.delete_cookie(name)#删除原来cookies
  225. i.update(cookies)
  226. browser.add_cookie(i)
  227. return
  228. raise Exception
  229. def set_Page_Parser(self,Parser):
  230. self.Parser = Parser
  231. self.Parser.browser = self.browser
  232. self.Parser.url = self.url
  233. self.Parser.dir = self.dir
  234. class Page_Parser:
  235. def __init__(self,Downloader:Page_Downloader):
  236. self.Downloader = Downloader
  237. self.Downloader.set_Page_Parser(self)
  238. self.func_list = []
  239. self.func_dict = {}
  240. self.init()
  241. def init(self,url=''):
  242. self.element_dict = {}#记录属性的名字
  243. self.now_url = url
  244. def add_base(self,func): # 装饰器
  245. def wrap(browser=None,num=None,name=None, *args, **kwargs) -> bool:
  246. try:
  247. func(browser=browser,num=num, name=name, *args, **kwargs)
  248. return True
  249. except:
  250. return False
  251. return wrap
  252. def add_func(self,name,func):
  253. n = len(self.func_list)
  254. self.func_list.append(f'{name}[{n}]')
  255. self.func_dict[f'{name}[{n}]'] = func
  256. def return_func(self,only=True):
  257. if only:
  258. return self.func_list.copy()
  259. else:
  260. return [f'var[{index}]@ {i}' for index,i in enumerate(self.func_list.copy())]
  261. def find_ID(self,id,not_all=False,**kwargs):
  262. @self.add_base
  263. def find(browser, num, name, *args, **kwargs):
  264. nonlocal self,id
  265. if browser == None:browser = self.browser
  266. if not_all:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_id(id)]#返回必须是list
  267. else:self.element_dict[f'{name}[{num}]'] = browser.find_elements_by_id(id)
  268. self.add_func(f'find_ID:{id}',find)#添加func
  269. def find_class(self,class_name,not_all=False,**kwargs):
  270. @self.add_base
  271. def find(browser, num, name, *args, **kwargs):
  272. nonlocal self,class_name
  273. if browser == None:browser = self.browser
  274. if not_all:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_class_name(class_name)]#返回必须是list
  275. else:self.element_dict[f'{name}[{num}]'] = browser.find_elements_by_class_name(class_name)#返回必须是list
  276. self.add_func(f'find_class:{class_name}',find)#添加func
  277. def find_name(self,name_,not_all=False,**kwargs):
  278. @self.add_base
  279. def find(browser, num, name, *args, **kwargs):
  280. nonlocal self,name_
  281. if browser == None:browser = self.browser
  282. if not_all:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_name(name_)]#返回必须是list
  283. else:self.element_dict[f'{name}[{num}]'] = browser.find_elements_by_name(name_)#返回必须是list
  284. self.add_func(f'find_name:{name_}',find)#添加func
  285. def find_xpath(self,xpath,not_all=False,**kwargs):
  286. @self.add_base
  287. def find(browser, num, name, *args, **kwargs):
  288. nonlocal self,xpath
  289. if browser == None:browser = self.browser
  290. if not_all:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_xpath(xpath)]#返回必须是list
  291. else:self.element_dict[f'{name}[{num}]'] = browser.find_elements_by_xpath(xpath)#返回必须是list
  292. self.add_func(f'find_xpath:{xpath}',find)#添加func
  293. def find_css(self,css_selector,not_all=False,**kwargs):
  294. @self.add_base
  295. def find(browser, num, name, *args, **kwargs):
  296. nonlocal self,css_selector
  297. if browser == None:browser = self.browser
  298. if not_all:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_css_selector(css_selector)]#返回必须是list
  299. else:self.element_dict[f'{name}[{num}]'] = browser.find_elements_by_css_selector(css_selector)#返回必须是list
  300. self.add_func(f'find_css:{css_selector}',find)#添加func
  301. def find_tag_name(self,tag_name,not_all=False,**kwargs):
  302. @self.add_base
  303. def find(browser, num, name, *args, **kwargs):
  304. nonlocal self,tag_name
  305. if browser == None:browser = self.browser
  306. if not_all:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_tag_name(tag_name)]#返回必须是list
  307. else:self.element_dict[f'{name}[{num}]'] = browser.find_elements_by_tag_name(tag_name)#返回必须是list
  308. self.add_func(f'find_tagName:{tag_name}',find)#添加func\
  309. def find_link_text(self,link_text,not_all=False,**kwargs):#匹配link
  310. @self.add_base
  311. def find(browser, num, name, *args, **kwargs):
  312. nonlocal self,link_text
  313. if browser == None:browser = self.browser
  314. if not_all:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_link_text(link_text)]#返回必须是list
  315. else:self.element_dict[f'{name}[{num}]'] = browser.find_elements_by_link_text(link_text)#返回必须是list
  316. self.add_func(f'find_link_text:{link_text}',find)#添加func
  317. def find_partial_link_text(self,partial_link_text,not_all=False,**kwargs):#模糊匹配
  318. @self.add_base
  319. def find(browser, num, name, *args, **kwargs):
  320. nonlocal self,partial_link_text
  321. if browser == None:browser = self.browser
  322. if not_all:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_partial_link_text(partial_link_text)]#返回必须是list
  323. else:self.element_dict[f'{name}[{num}]'] = [browser.find_element_by_partial_link_text(partial_link_text)]#返回必须是list
  324. self.add_func(f'find_partial_link_text:{partial_link_text}',find)#添加func
  325. def find_switch_to_alert(self,*args,**kwargs):#定位弹出框
  326. @self.add_base
  327. def find(browser, num, name, *args, **kwargs):
  328. nonlocal self
  329. if browser == None:browser = self.browser
  330. self.element_dict[f'{name}[{num}]'] = [browser.switch_to.alert()]
  331. self.add_func(f'find_alert',find)#添加func
  332. def find_switch_to_active_element(self,*args,**kwargs):#定位焦点元素
  333. @self.add_base
  334. def find(browser, num, name, *args, **kwargs):
  335. nonlocal self
  336. if browser == None:browser = self.browser
  337. self.element_dict[f'{name}[{num}]'] = [browser.switch_to.active_element()]
  338. self.add_func(f'active_element',find)#添加func
  339. def find_switch_to_frame(self,reference,is_id=False,*args,**kwargs):#定位Frame
  340. @self.add_base
  341. def find(browser, num, name, *args, **kwargs):
  342. nonlocal self,reference,is_id
  343. if browser == None:browser = self.browser
  344. if reference == None:
  345. self.element_dict[f'{name}[{num}]'] = [browser.default_content()]# 回到主文档
  346. elif reference == '':
  347. self.element_dict[f'{name}[{num}]'] = [browser.parent_frame()]# 回到父文档
  348. else:
  349. if is_id:reference = int(reference)
  350. self.element_dict[f'{name}[{num}]'] = [browser.switch_to.frame(str(reference))]# 定位进入文档
  351. func_name = {None:'主文档','':'父文档'}.get(reference,reference)
  352. self.add_func(f'find_frame:{func_name}',find)#添加func
  353. def send_keys(self,text,element_value,index=0,**kwargs):#输入文字
  354. @self.add_base
  355. def action(*args, **kwargs):
  356. nonlocal self
  357. self.element_dict[element_value][index].send_keys(text)
  358. self.add_func(f'sent_text:{text}>{element_value}[{index}]', action) # 添加func
  359. def User_Passwd(self,User,Passwd,element_value,index=0,**kwargs):#输入验证(User&Password)
  360. @self.add_base
  361. def action(*args, **kwargs):
  362. nonlocal self
  363. self.element_dict[element_value][index].authenticate(User,Passwd)
  364. self.add_func(f'User:Passwd:{User};{Passwd}>{element_value}[{index}]', action) # 添加func
  365. def clear(self,element_value,index=0,**kwargs):#清空文本
  366. @self.add_base
  367. def action(*args, **kwargs):
  368. nonlocal self
  369. self.element_dict[element_value][index].clear()
  370. self.add_func(f'clear_text>{element_value}[{index}]', action) # 添加func
  371. def click(self,element_value,index=0,**kwargs):#点击按钮
  372. @self.add_base
  373. def action(*args, **kwargs):
  374. nonlocal self
  375. self.element_dict[element_value][index].click()
  376. self.add_func(f'click>{element_value}[{index}]', action) # 添加func
  377. def accept(self,element_value,index=0,**kwargs):#点击确定(弹出框)
  378. @self.add_base
  379. def action(*args, **kwargs):
  380. nonlocal self
  381. self.element_dict[element_value][index].accept()
  382. self.add_func(f'accept>{element_value}[{index}]', action) # 添加func
  383. def dismiss(self,element_value,index=0,**kwargs):#点击取消(弹出框)
  384. @self.add_base
  385. def action(*args, **kwargs):
  386. nonlocal self
  387. self.element_dict[element_value][index].dismiss()
  388. self.add_func(f'dismiss>{element_value}[{index}]', action) # 添加func
  389. def submit(self,element_value,index=0,**kwargs):#提交表单
  390. @self.add_base
  391. def action(*args, **kwargs):
  392. nonlocal self
  393. self.element_dict[element_value][index].submit()
  394. self.add_func(f'submit>{element_value}[{index}]', action) # 添加func
  395. def deselect_by_index(self,element_value,deselect,index=0,**kwargs):#根据index取消选择
  396. @self.add_base
  397. def action(*args, **kwargs):
  398. nonlocal self
  399. self.element_dict[element_value][index].deselect_by_index(int(deselect))
  400. self.add_func(f'deselect_by_index:{deselect}>{element_value}[{index}]', action) # 添加func
  401. def deselect_by_text(self,element_value,deselect,index=0,**kwargs):#根据text取消选择
  402. @self.add_base
  403. def action(*args, **kwargs):
  404. nonlocal self
  405. self.element_dict[element_value][index].deselect_by_visible_text(deselect)
  406. self.add_func(f'deselect_by_text:{deselect}>{element_value}[{index}]', action) # 添加func
  407. def deselect_by_value(self,element_value,deselect,index=0,**kwargs):#根据value取消选择
  408. @self.add_base
  409. def action(*args, **kwargs):
  410. nonlocal self
  411. self.element_dict[element_value][index].deselect_by_value(deselect)
  412. self.add_func(f'deselect_by_value:{deselect}>{element_value}[{index}]', action) # 添加func
  413. def select_by_index(self,element_value,deselect,index=0,**kwargs):#根据index选择
  414. @self.add_base
  415. def action(*args, **kwargs):
  416. nonlocal self
  417. self.element_dict[element_value][index].select_by_index(int(deselect))
  418. self.add_func(f'select_by_index:{deselect}>{element_value}[{index}]', action) # 添加func
  419. def select_by_text(self,element_value,deselect,index=0,**kwargs):#根据text选择
  420. @self.add_base
  421. def action(*args, **kwargs):
  422. nonlocal self
  423. self.element_dict[element_value][index].select_by_visible_text(deselect)
  424. self.add_func(f'select_by_text:{deselect}>{element_value}[{index}]', action) # 添加func
  425. def select_by_value(self,element_value,deselect,index=0,**kwargs):#根据value选择
  426. @self.add_base
  427. def action(*args, **kwargs):
  428. nonlocal self
  429. self.element_dict[element_value][index].select_by_value(deselect)
  430. self.add_func(f'select_by_value:{deselect}>{element_value}[{index}]', action) # 添加func
  431. def back(self,**kwargs):# 返回
  432. @self.add_base
  433. def action(*args, **kwargs):
  434. nonlocal self
  435. self.browser.back()
  436. self.add_func(f'BACK', action)
  437. def forward(self,**kwargs):# 前进
  438. @self.add_base
  439. def action(*args, **kwargs):
  440. nonlocal self
  441. self.browser.forward()
  442. self.add_func(f'FORWARD', action)
  443. def refresh(self,**kwargs):# 刷新
  444. @self.add_base
  445. def action(*args, **kwargs):
  446. nonlocal self
  447. self.browser.refresh()
  448. self.add_func(f'REFRESH', action)
  449. def wait_sleep(self,time:int=2,**kwargs):#暴力等待
  450. @self.add_base
  451. def action(*args, **kwargs):
  452. nonlocal self
  453. sleep(time)
  454. self.add_func(f'WAIT:{time}s', action)
  455. def set_wait(self,time:int=2,**kwargs):#隐式等待
  456. @self.add_base
  457. def action(*args, **kwargs):
  458. nonlocal self
  459. sleep(time)
  460. self.add_func(f'Loading_wait:{time}s', action)
  461. def run_JS(self,JS,**kwargs):
  462. @self.add_base
  463. def action(num,name,*args, **kwargs):
  464. nonlocal self
  465. get = self.browser.execute_script(JS)
  466. if hasattr(get,'__getitem__'):#可切片
  467. self.element_dict[f'{name}[{num}]'] = get # 返回必须是list
  468. else:
  469. self.element_dict[f'{name}[{num}]'] = [get]
  470. self.add_func(f'run_js:{JS}', action)
  471. def to_text(self,**kwargs):#获取网页源码
  472. @self.add_base
  473. def action(num,name,*args, **kwargs):
  474. nonlocal self
  475. try:
  476. self.element_dict[f'{name}[{num}]'] = [self.browser.page_source,self.now_url]
  477. except:
  478. self.element_dict[f'{name}[{num}]'] = [self.browser.text, self.now_url]#request
  479. self.add_func(f'get_page_source', action)
  480. def out_html(self,element_value,**kwargs):#输出网页源码
  481. @self.add_base
  482. def action(*args, **kwargs):
  483. nonlocal self
  484. md5 = hashlib.md5() # 应用MD5算法
  485. md5.update(f'{time.time()}_{self.now_url}'.encode('utf-8'))
  486. name = md5.hexdigest()
  487. save_dir = self.dir + '/' + name + '.cotan_source'
  488. print(save_dir)
  489. with open(save_dir,'w') as f:
  490. f.write(self.element_dict[element_value][0])
  491. with open(save_dir + '.CoTanURL','w') as f:
  492. f.write(self.element_dict[element_value][1])
  493. self.add_func(f'write_html<{element_value}', action)
  494. def del_all_cookies(self,**kwargs):#删除所有曲奇
  495. @self.add_base
  496. def action(*args, **kwargs):
  497. nonlocal self
  498. self.browser.delete_all_cookies()
  499. self.add_func(f'del_all_cookies', action)
  500. def del_cookies(self,cookies_name,**kwargs):#删除指定曲奇
  501. @self.add_base
  502. def action(*args, **kwargs):
  503. nonlocal self
  504. self.browser.delete_cookie(cookies_name)
  505. self.add_func(f'del_cookies:{cookies_name}', action)
  506. def add_cookies(self,cookies,**kwargs):#添加指定曲奇
  507. @self.add_base
  508. def action(*args, **kwargs):
  509. nonlocal self
  510. self.browser.add_cookie(cookies)
  511. self.add_func(f'add_cookies:{cookies}', action)
  512. def update_cookies(self,cookies_name,cookies,**kwargs):#更新曲奇
  513. @self.add_base
  514. def action(*args, **kwargs):
  515. nonlocal self
  516. now_cookies = self.browser.get_cookie(cookies_name)
  517. self.browser.delete_cookie(cookies_name)
  518. now_cookies.update(cookies)
  519. self.browser.add_cookie(now_cookies)
  520. self.add_func(f'add_cookies:{cookies}', action)
  521. def get_cookies(self,cookies_name,**kwargs):#获取指定曲奇
  522. @self.add_base
  523. def action(num,name,*args, **kwargs):
  524. nonlocal self
  525. self.element_dict[f'{name}[{num}]'] = [self.browser.get_cookie(cookies_name)]
  526. self.add_func(f'get_cookies:{cookies_name}', action)
  527. def get_all_cookies(self,**kwargs):#获取所有曲奇
  528. @self.add_base
  529. def action(num,name,*args, **kwargs):
  530. nonlocal self
  531. self.element_dict[f'{name}[{num}]'] = self.browser.get_cookie()
  532. self.add_func(f'get_all_cookies', action)
  533. def make_bs(self, element_value, **kwargs): # 解析成bs4对象
  534. @self.add_base
  535. def action(num,name,*args, **kwargs):
  536. nonlocal self
  537. self.element_dict[f'{name}[{num}]'] = [bs4.BeautifulSoup(self.element_dict[element_value][0], "html.parser")]
  538. self.add_func(f'Parsing:{element_value}', action) # 添加func
  539. def listSlicing(self,index:(slice,int),element_value):
  540. if type(index) is int:
  541. return [self.element_dict[element_value][index]]
  542. else:
  543. return self.element_dict[element_value][index]
  544. def to_Database(self,element_value,index,data:(str,list),dataBase_name:str,**kwargs):#传入data Base
  545. @self.add_base
  546. def action(*args, **kwargs):
  547. global data_base
  548. nonlocal self
  549. iter_list = self.listSlicing(index, element_value)
  550. for bs in iter_list:
  551. new = []
  552. for i in data:
  553. if i == '$name&':new.append(bs.name)
  554. elif i == '$self&':new.append(str(bs).replace('\n',''))
  555. elif i == '$string$':new.append(str(bs.string).replace('\n',''))
  556. else:
  557. new.append(bs.attrs.get(i,''))
  558. data_base.add_DataBase(dataBase_name,new)
  559. self.add_func(f'DataBase:{data}<{element_value}[{index}]>{dataBase_name}', action) # 添加func
  560. def to_Database_by_re(self,element_value,index,data:str,dataBase_name:str,**kwargs):#通过正则,传入dataBase
  561. data = regular.compile(data)
  562. @self.add_base
  563. def action(*args, **kwargs):
  564. global data_base
  565. nonlocal self
  566. iter_list = self.listSlicing(index, element_value)
  567. for bs in iter_list:
  568. new = regular.findall(data,str(bs))
  569. data_base.add_DataBase(dataBase_name,new)
  570. self.add_func(f'DataBase:{data}<{element_value}[{index}]>{dataBase_name}', action) # 添加func
  571. def findAll(self, element_value,tag:(str,list),attribute:dict,limit,recursive,index:(slice,int),**kwargs):#根据标签定位
  572. if type(tag) is str:
  573. tag = str(tag).split(',')
  574. try:
  575. limit = int(limit)
  576. except:
  577. limit = None
  578. @self.add_base
  579. def action(num,name,*args, **kwargs):
  580. nonlocal self
  581. iter_list = self.listSlicing(index,element_value)
  582. paser_list = []
  583. for bs in iter_list:
  584. try:
  585. re = bs.find_all(tag,attribute,limit=limit,recursive=recursive)
  586. except:
  587. try:
  588. if str(bs.name) not in tag:raise Exception
  589. for agrs_name in attribute:
  590. text = attribute[agrs_name]
  591. if type(text) is str:
  592. if bs.attrs[agrs_name] != text:raise Exception
  593. else:#正则匹配
  594. if not regular.match(text,bs.attrs[agrs_name]): raise Exception
  595. re = [bs]
  596. except:
  597. re = []
  598. paser_list += re
  599. self.element_dict[f'{name}[{num}]'] = paser_list
  600. self.add_func(f'findAll:{element_value}[{index}]', action) # 添加func
  601. def findAll_by_text(self, element_value,text:(regular.compile,str),limit,recursive,index:(slice,int),**kwargs):#根据text定位
  602. try:
  603. limit = int(limit)
  604. except:
  605. limit = None
  606. @self.add_base
  607. def action(num,name,*args, **kwargs):
  608. nonlocal self
  609. iter_list = self.listSlicing(index,element_value)
  610. paser_list = []
  611. for bs in iter_list:
  612. try:
  613. re = bs.find_all(text=text,limit=limit,recursive=recursive)
  614. except:
  615. try:
  616. if type(text) is str:
  617. if str(bs.string) != text:raise Exception
  618. else:
  619. if not regular.match(text,str(bs.string)):raise Exception
  620. re = [bs]
  621. except:
  622. re = []
  623. paser_list += re
  624. self.element_dict[f'{name}[{num}]'] = paser_list
  625. self.add_func(f'findAll_by_text:{element_value}[{index}]', action) # 添加func
  626. def __get_other_base(self,element_value,index:(slice,int),who='children',**kwargs):#获得子、后代、兄弟标签的基类
  627. @self.add_base
  628. def action(num,name,*args, **kwargs):
  629. nonlocal self
  630. iter_list = self.listSlicing(index, element_value)
  631. paser_list = []
  632. for bs in iter_list:
  633. if who != 'brothers':
  634. paser_list += {'children':bs.children,'offspring':bs.descendants,'down':bs.next_siblings,
  635. 'up':bs.previous_siblings}.get(who,bs.children)
  636. else:
  637. paser_list += bs.previous_siblings
  638. paser_list += bs.next_siblings
  639. self.element_dict[f'{name}[{num}]'] = list(set(paser_list))
  640. self.add_func(f'get_{who}:{element_value}[{index}]', action) # 添加func
  641. def get_children(self,element_value,index:(slice,int),**kwargs):
  642. return self.__get_other_base(element_value,index)
  643. def get_offspring(self,element_value,index:(slice,int),**kwargs):
  644. return self.__get_other_base(element_value,index,'offspring')
  645. def get_up(self,element_value,index:(slice,int),**kwargs):
  646. return self.__get_other_base(element_value,index,'up')
  647. def get_down(self,element_value,index:(slice,int),**kwargs):
  648. return self.__get_other_base(element_value,index,'down')
  649. def get_brothers(self,element_value,index:(slice,int),**kwargs):
  650. return self.__get_other_base(element_value,index,'brothers')
  651. def get_by_path(self,element_value,index:(slice,int),path,**kwargs):#根据bs4的目录选择
  652. @self.add_base
  653. def action(num,name,*args, **kwargs):
  654. nonlocal self
  655. iter_list = self.listSlicing(index, element_value)
  656. paser_list = []
  657. for bs in iter_list:
  658. try:
  659. re = eval(str(path),{'self':bs})
  660. if re == None:raise Exception
  661. paser_list.append(re)
  662. except:
  663. pass
  664. self.element_dict[f'{name}[{num}]'] = paser_list
  665. self.add_func(f'get>{path}:{element_value}[{index}]', action) # 添加func
  666. def Webpage_snapshot(self,**kwargs):
  667. @self.add_base
  668. def action(*args, **kwargs):
  669. nonlocal self
  670. md5 = hashlib.md5() # 应用MD5算法
  671. md5.update(f'{time.time()}_{self.now_url}'.encode('utf-8'))
  672. name = md5.hexdigest()
  673. with open(self.dir + '/' + name + '.png.CoTanURL','w') as f:
  674. f.write(self.now_url)
  675. self.browser.save_screenshot(self.dir + '/' + name + '.png')
  676. sleep(1)
  677. self.add_func(f'Webpage_snapshot', action) # 添加func
  678. def add_url(self, element_value, index: (slice, int), url_name,update_func,url_args:dict, **kwargs):# 自动添加url
  679. @self.add_base
  680. def action(*args, **kwargs):
  681. nonlocal self
  682. iter_list = self.listSlicing(index, element_value)
  683. for bs in iter_list:
  684. try:
  685. if url_name == '$name&':
  686. new_url = bs.name
  687. elif url_name == '$self&':
  688. new_url = str(bs).replace('\n', '')
  689. elif url_name == '$string$':
  690. new_url = str(bs.string).replace('\n', '')
  691. else:
  692. new_url = bs.attrs.get(url_name, '')
  693. url.add_url(new_url, **url_args)
  694. except:
  695. pass
  696. update_func()#更新tkinter
  697. self.add_func(f'add_URL<{element_value}[{index}]:{url_name}', action) # 添加func
  698. def to_json(self,**kwargs):
  699. @self.add_base
  700. def action(num, name, *args, **kwargs):
  701. nonlocal self
  702. self.element_dict[f'{name}[{num}]'] = [self.browser.json()]#request 解析为 json
  703. self.add_func(f'to_json', action) # 添加func
  704. def Element_interaction(self,update_func=lambda *args:None):#元素交互
  705. func_list = self.func_list
  706. status = None
  707. def update(func_name):
  708. nonlocal status,self
  709. if status:
  710. success_code = 'Success to run'
  711. elif status == None:
  712. success_code = 'No status'
  713. else:
  714. success_code = 'Wrong to run'
  715. value_box = []
  716. for i in self.element_dict:
  717. try:
  718. value_box.append(f'{i}[{len(i)}] = {self.element_dict[i]}')
  719. except:
  720. value_box.append(f'{i} = {self.element_dict[i]}')
  721. update_func(func_name, success_code, value_box) # 信息更新系统
  722. update('start')
  723. for func_num in range(len(func_list)):
  724. func_name = func_list[func_num]
  725. update(func_name)
  726. status = self.func_dict[func_name](num=f'{func_num}',name='var')
  727. update('Finish')