template.py 53 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633
  1. import bs4
  2. import hashlib
  3. import os
  4. import re as regular
  5. import threading
  6. import time
  7. from abc import ABCMeta, abstractmethod
  8. from time import sleep
  9. import logging
  10. from selenium import webdriver
  11. from selenium.webdriver import ActionChains
  12. from selenium.webdriver.common.keys import Keys
  13. from selenium.common.exceptions import InvalidSessionIdException, WebDriverException
  14. import requests
  15. from system import plugin_class_loading, get_path, basicConfig
  16. logging.basicConfig(**basicConfig)
  17. keys_name_dict = {
  18. "ctrl": Keys.CONTROL,
  19. "shift": Keys.SHIFT,
  20. "tab": Keys.TAB,
  21. "left_ctrl": Keys.LEFT_CONTROL,
  22. "left_shift": Keys.LEFT_SHIFT,
  23. "left_alt": Keys.LEFT_ALT,
  24. "ALT": Keys.ALT,
  25. "enter": Keys.ENTER,
  26. "return": Keys.RETURN,
  27. "backspace": Keys.BACKSPACE,
  28. "del": Keys.DELETE,
  29. "pgup": Keys.PAGE_UP,
  30. "pgdn": Keys.PAGE_DOWN,
  31. "home": Keys.HOME,
  32. "end": Keys.END,
  33. "esc": Keys.CANCEL,
  34. "insert": Keys.INSERT,
  35. "meta": Keys.META,
  36. "up": Keys.UP,
  37. "down": Keys.DOWN,
  38. "right": Keys.RIGHT,
  39. "left": Keys.LEFT,
  40. } # 键-值映射
  41. class PageParserError(Exception):
  42. pass
  43. class UrlError(Exception):
  44. pass
  45. class CookiesError(Exception):
  46. pass
  47. class Database(metaclass=ABCMeta):
  48. @abstractmethod
  49. def __str__(self):
  50. pass
  51. @abstractmethod
  52. def close(self):
  53. pass
  54. @abstractmethod
  55. def add_new(self, data):
  56. pass
  57. @abstractmethod
  58. def remove(self):
  59. pass
  60. @abstractmethod
  61. def out_file(self, out_dir):
  62. pass
  63. class CoTanDB(Database):
  64. def __init__(self, name):
  65. self.dir = rf"{os.getcwd()}{os.sep}Database_dir{os.sep}{name}.cotanDB" # 创建DB文件
  66. self.file = open(self.dir, "r+" if os.path.exists(self.dir) else "w+")
  67. self.id = 0
  68. self.name = name
  69. for _ in self.file.readlines():
  70. self.id += 1
  71. def __str__(self):
  72. return self.name
  73. def close(self):
  74. try:
  75. self.file.close()
  76. except IOError:
  77. pass
  78. def add_new(self, data):
  79. data_str = str(self.id)
  80. for i in data:
  81. data_str += "," + str(i)
  82. data_str += "\n"
  83. self.file.write(data_str)
  84. self.file.flush()
  85. self.id += 1
  86. def remove(self):
  87. self.file.close()
  88. os.remove(self.dir)
  89. def out_file(self, out_dir):
  90. with open(out_dir + fr"{os.sep}{self.name}.contanDB", "w") as f:
  91. with open(self.dir) as g:
  92. f.write(g.read())
  93. class DatabaseControllerBase:
  94. def __init__(self):
  95. self.database = {}
  96. class AddDatabase(DatabaseControllerBase):
  97. def add_database(self, name): # 添加数据表
  98. self.database[name] = CoTanDB(name)
  99. class DatabaseControllerCustom(metaclass=ABCMeta):
  100. @abstractmethod
  101. def close(self, name):
  102. pass
  103. @abstractmethod
  104. def close_all(self):
  105. pass
  106. @abstractmethod
  107. def rm_database(self, name):
  108. pass
  109. @abstractmethod
  110. def out(self, name, save_dir):
  111. pass
  112. @abstractmethod
  113. def return_database(self):
  114. pass
  115. class DatabaseController(AddDatabase, DatabaseControllerCustom): # data base控制器
  116. def add_new(self, name, data): # 添加新内容
  117. database = self.database.get(name)
  118. if database is None:
  119. self.add_database(name)
  120. database = self.database.get(name)
  121. database.add_new(data)
  122. def close(self, name): # 关闭数据表
  123. try:
  124. self.database[name].close()
  125. del self.database[name]
  126. except IndexError:
  127. pass
  128. def close_all(self): # 关闭所有数据表
  129. for i in self.database:
  130. self.database[i].close()
  131. self.database = {}
  132. def rm_database(self, name): # 删除数据表
  133. self.database[name].remove()
  134. del self.database[name]
  135. def out(self, name, save_dir): # 输出数据表
  136. self.database[name].out_file(save_dir)
  137. def return_database(self):
  138. return list(self.database.keys())
  139. class LogBase(metaclass=ABCMeta):
  140. @abstractmethod
  141. def write(self, data):
  142. pass
  143. @abstractmethod
  144. def close(self):
  145. pass
  146. class Log(LogBase):
  147. def __init__(self, log_dir):
  148. self.log_dir = log_dir
  149. self.log_file = open(
  150. log_dir + f"{os.sep}log.coTanLog",
  151. "r+" if os.path.exists(log_dir + "log.coTanLog") else "w+",
  152. )
  153. def write(self, data):
  154. self.log_file.write(
  155. f"[{time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))}] "
  156. + data
  157. + "\n"
  158. )
  159. self.log_file.flush()
  160. def close(self):
  161. self.log_file.close()
  162. class PageBase:
  163. def __init__(self, time_out):
  164. self.url = ""
  165. self.user_agent = ""
  166. self.mode = "PAGE"
  167. self.time_out = time_out
  168. def __str__(self):
  169. return f"[{self.time_out}s]{self.mode}-{self.url}:UA>{self.user_agent}"
  170. @abstractmethod
  171. def init(self, *args, **kwargs):
  172. pass
  173. class __RequestsBase(PageBase):
  174. def init(self, user_agent, url, cookies):
  175. if user_agent == "":
  176. user_agent = (
  177. f'--user-agent ="Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
  178. f'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36 Edg/80.0.361.66"'
  179. )
  180. self.user_agent = user_agent
  181. self.headers = {
  182. "Accept": "text/html, application/xhtml+xml, image/jxr, */*",
  183. "Accept - Encoding": "gzip, deflate",
  184. "Accept-Language": "zh-Hans-CN, zh-Hans; q=0.5",
  185. "Connection": "Keep-Alive",
  186. "User-Agent": user_agent,
  187. }
  188. self.url = url
  189. self.cookies = cookies
  190. self.new = True
  191. @plugin_class_loading(get_path(r'template/crawler'))
  192. class UrlGet(__RequestsBase): # 通过requests的post请求
  193. def __init__(self, url, time_out, user_agent="", cookies=None, **kwargs):
  194. super(UrlGet, self).__init__(time_out)
  195. self.mode = "simplify_get"
  196. self.requests = requests.get
  197. self.init(user_agent, url, cookies)
  198. @plugin_class_loading(get_path(r'template/crawler'))
  199. class UrlPost(__RequestsBase): # 通过requests的post请求
  200. def __init__(self, url, data, time_out, user_agent="", cookies=None, **kwargs):
  201. super(UrlPost, self).__init__(time_out)
  202. self.mode = "post"
  203. self.data = data
  204. self.requests = requests.post
  205. self.init(user_agent, url, cookies)
  206. def __str__(self):
  207. return super(UrlPost, self).__str__() + f";data>{self.data}"
  208. @plugin_class_loading(get_path(r'template/crawler'))
  209. class UrlPage(PageBase):
  210. def __init__(
  211. self,
  212. url,
  213. time_out,
  214. first_run=False,
  215. head=False,
  216. no_plugins=True,
  217. no_js=False,
  218. no_java=False,
  219. no_img=False,
  220. user_agent="",
  221. cookies=None,
  222. new=False,
  223. down_load_dir="",
  224. **kwargs,
  225. ):
  226. super(UrlPage, self).__init__(time_out)
  227. self.url = url
  228. self.mode = "get"
  229. self.options = webdriver.ChromeOptions()
  230. self.cookies = cookies # cookies存储位置
  231. self.new = new # 新键页面or新键浏览器
  232. self.down_load_dir = down_load_dir
  233. self.init(first_run, head, no_plugins, no_js, no_java, no_img, user_agent)
  234. def init(self, first_run, head, no_plugins, no_js, no_java, no_img, user_agent):
  235. self.options.add_argument("disable-infobars") # 不显示
  236. prefs = {
  237. "profile.default_content_settings.popups": 0,
  238. "download.default_directory": self.down_load_dir,
  239. }
  240. self.options.add_experimental_option("prefs", prefs) # 下载设置
  241. if first_run:
  242. self.options.add_argument("-first run")
  243. if head: # 无头设置
  244. self.options.add_argument("--headless")
  245. self.options.add_argument("--disable-gpu")
  246. if no_plugins:
  247. self.options.add_argument("--disable-plugins")
  248. if no_js:
  249. self.options.add_argument("--disable-javascript")
  250. if no_java:
  251. self.options.add_argument("--disable-java")
  252. if no_img:
  253. self.options.add_argument("blink-settings=imagesEnabled=false")
  254. if user_agent == "":
  255. user_agent = (
  256. f'user-agent ="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
  257. f'Chrome/80.0.3987.132 Safari/537.36"'
  258. )
  259. # self.options.add_argument(f'--user-agent ="{UA}"')
  260. self.user_agent = user_agent
  261. def __str__(self):
  262. return f"{self.mode}-{self.url}:UA>{self.user_agent}"
  263. class Urlbase(metaclass=ABCMeta):
  264. url_count = 0 # url处理器个数
  265. def __init__(self, dic=f"", dic_run=f""):
  266. Urlbase.url_count += 1
  267. self.save_dir = dic
  268. dic += f"{os.sep}url[{Urlbase.url_count}].cot_url"
  269. dic_run += f"{os.sep}url_run[{Urlbase.url_count}].cot_url"
  270. self.dir = dic
  271. self.dir_run = dic_run
  272. self.file = open(dic, "a") # 写入url_history的文件
  273. self.file_run = open(dic_run, "a") # 写入已读url文件
  274. self.url_list = [] # 待读url
  275. self.url_history = [] # url历史
  276. self.filter = {} # 过滤函数
  277. @abstractmethod
  278. def filter_func(self, url, **kwargs):
  279. pass
  280. @abstractmethod
  281. def add_filter_func(self, func, name):
  282. pass
  283. @abstractmethod
  284. def del_filter_func(self, index):
  285. pass
  286. @abstractmethod
  287. def return_filter_func(self):
  288. pass
  289. @abstractmethod
  290. def close(self):
  291. pass
  292. @abstractmethod
  293. def out_url_history(self, url):
  294. pass
  295. @abstractmethod
  296. def out_url_run(self, url):
  297. pass
  298. @plugin_class_loading(get_path(r'template/crawler'))
  299. class UrlFile(Urlbase, metaclass=ABCMeta):
  300. def close(self):
  301. self.file.close()
  302. self.file_run.close()
  303. def out_url_history(self, url): # 输出url历史
  304. self.file.write(f"{url}\n")
  305. self.file.flush()
  306. def out_url_run(self, url): # 输出已经运行的url
  307. self.file_run.write(f"{url}\n")
  308. self.file_run.flush()
  309. @plugin_class_loading(get_path(r'template/crawler'))
  310. class UrlAdd(Urlbase, metaclass=ABCMeta):
  311. def filter_func(self, url, **kwargs): # url过滤系统
  312. for i in self.filter:
  313. if not self.filter[i](url):
  314. return False
  315. return True
  316. def add_filter_func(self, func, name): # 添加过滤函数
  317. self.filter[name] = func
  318. def del_filter_func(self, index): # 删除过滤函数
  319. del self.filter[list(self.filter.keys())[index]]
  320. def clean_filter_func(self):
  321. self.filter = {}
  322. def return_filter_func(self):
  323. return list(self.filter.keys())
  324. @plugin_class_loading(get_path(r'template/crawler'))
  325. class UrlReturn(Urlbase, metaclass=ABCMeta):
  326. def del_url(self, index): # 删除url
  327. self.out_url_run(f"DELETE {self.url_list[index]}")
  328. del self.url_list[index]
  329. def get_url(self): # 取得url
  330. url_page = self.url_list[0]
  331. self.out_url_run(url_page.url)
  332. del self.url_list[0]
  333. return url_page
  334. def is_finish(self):
  335. return len(self.url_list) == 0
  336. def add_url(self, url, func, data=None, must=False, **kwargs): # 添加url
  337. if func == "":
  338. func = "get"
  339. if func == "get":
  340. url_ = url
  341. else:
  342. url_ = url + str(data)
  343. if must or (url_ not in self.url_history and self.filter_func(url, func=func)): # 1.url不存在历史,2.url满足筛选条件
  344. if func == "get":
  345. self.url_list.append(
  346. UrlPage(url=url, **kwargs, down_load_dir=self.dir)
  347. ) # 添加到待取得url
  348. elif func == "simplify_get":
  349. self.url_list.append(
  350. UrlGet(url=url, **kwargs, down_load_dir=self.dir)
  351. ) # 添加到待取得url
  352. else:
  353. self.url_list.append(UrlPost(url=url, data=data, **kwargs)) # 添加到待取得url
  354. self.url_history.append(url_) # 添加到历史url
  355. self.out_url_history(url_) # 输出历史url
  356. return True # 写入成功
  357. return False # 写入失败
  358. class SeleniumBase(metaclass=ABCMeta):
  359. @abstractmethod
  360. def selenium_mode(self, func_cookie, url):
  361. pass
  362. class RequestsBase(metaclass=ABCMeta):
  363. @abstractmethod
  364. def requests_mode(self, func_cookie, url):
  365. pass
  366. class PagedownloaderBase(SeleniumBase, RequestsBase, metaclass=ABCMeta):
  367. downloader_count = 0
  368. def __init__(self, url, dic=""):
  369. self.url = url
  370. self.dir = dic
  371. self.log = Log(dic)
  372. PagedownloaderBase.downloader_count += 1
  373. self.page_source_dict = {} # 页面保存信息
  374. self.cookie_Thread = None # 子进程
  375. self.browser = None
  376. self.cookie_dict = {}
  377. self.cookie_dict_list = {} # sele的cookies
  378. self.last_mode = None
  379. def set_page_parser(self, parser):
  380. self.parser = parser
  381. @abstractmethod
  382. def monitoring_add_cookies(self, cookies):
  383. pass
  384. @abstractmethod
  385. def monitoring_clear_cookier(self):
  386. pass
  387. @plugin_class_loading(get_path(r'template/crawler'))
  388. class PageDownloaderRun(PagedownloaderBase, metaclass=ABCMeta):
  389. def close(self):
  390. self.log.close()
  391. def stop(self):
  392. self.break_ = False
  393. if self.last_mode is not None:
  394. try:
  395. self.browser.quit()
  396. except InvalidSessionIdException:
  397. pass
  398. self.last_mode = None
  399. def start_to_run(self, *args, func_cookie): # 用get请求url ->得到一个页面信息
  400. self.break_ = False
  401. self.page_source_dict = {}
  402. self.url_text = self.url.get_url() # 获取一个url
  403. url = self.url_text.url
  404. try:
  405. if self.url_text.mode == "get":
  406. self.selenium_mode(func_cookie, url)
  407. else: # requests模式
  408. self.requests_mode(func_cookie, url)
  409. except BaseException:
  410. raise CookiesError
  411. finally:
  412. self.last_mode = self.url_text.mode
  413. self.parser.browser = self.browser
  414. self.parser.init(url)
  415. return self.browser
  416. @plugin_class_loading(get_path(r'template/crawler'))
  417. class PageDownloaderCookies(PagedownloaderBase, metaclass=ABCMeta):
  418. def monitoring_del_cookies(self, name): # 删除指定cookies
  419. self.browser.delete_cookie(name)
  420. def monitoring_clear_cookier(self): # 清空cookies
  421. self.browser.delete_all_cookies()
  422. def monitoring_add_cookies(self, cookies: dict): # 新增cookies
  423. self.browser.add_cookie(cookies)
  424. def monitoring_update_cookies(self, name, cookies: dict):
  425. cookies_list = self.browser.get_cookies()
  426. for i in cookies_list:
  427. if i.get("name", None) == name:
  428. self.browser.delete_cookie(name) # 删除原来cookies
  429. i.update(cookies)
  430. self.browser.add_cookie(i)
  431. return
  432. raise Exception
  433. @plugin_class_loading(get_path(r'template/crawler'))
  434. class PageDownloaderRequests(PageDownloaderRun, metaclass=ABCMeta):
  435. def requests_start_cookies(self, func_cookie, url):
  436. self.cookie_dict[url] = requests.utils.dict_from_cookiejar(
  437. self.browser.cookies
  438. ) # 保存cookies
  439. func_cookie([self.cookie_dict[url]])
  440. def requests_run(self, parameters, url):
  441. self.browser = self.url_text.requests(
  442. url,
  443. headers=self.url_text.headers,
  444. timeout=self.url_text.time_out,
  445. **parameters,
  446. )
  447. def requests_data(self, parameters):
  448. if self.url_text.mode == "post":
  449. parameters["data"] = self.url_text.data
  450. return parameters
  451. def requests_cookies(self, func_cookie):
  452. try:
  453. parameters = {"cookies": self.cookie_dict[self.url_text.cookies]}
  454. except KeyError:
  455. parameters = {}
  456. func_cookie([])
  457. else:
  458. func_cookie([parameters["cookies"]])
  459. return parameters
  460. def requests_mode(self, func_cookie, url):
  461. parameters = self.requests_cookies(func_cookie)
  462. parameters = self.requests_data(parameters)
  463. self.requests_run(parameters, url)
  464. self.requests_start_cookies(func_cookie, url)
  465. @plugin_class_loading(get_path(r'template/crawler'))
  466. class PageDownloaderSelenium(PageDownloaderRun, metaclass=ABCMeta):
  467. def selenium_quit(self):
  468. try:
  469. self.browser.quit()
  470. except InvalidSessionIdException:
  471. pass
  472. def selenium_cookies(self):
  473. try:
  474. if not self.url_text.new:
  475. raise UrlError
  476. cookies_list = self.cookie_dict_list[self.url_text.cookies]
  477. except (UrlError, KeyError):
  478. pass
  479. else:
  480. self.monitoring_clear_cookier()
  481. try:
  482. for i in cookies_list:
  483. self.monitoring_add_cookies(i)
  484. except WebDriverException:
  485. pass
  486. def start_selenium(self, quit_=True):
  487. if quit_:
  488. self.selenium_quit()
  489. self.browser = webdriver.Chrome(chrome_options=self.url_text.options)
  490. def selenium_run(self, url):
  491. self.browser.set_page_load_timeout(self.url_text.time_out) # 设置页面加载超时
  492. self.browser.set_script_timeout(self.url_text.time_out) # 设置页面异步js执行超时
  493. self.browser.get(url)
  494. def selenium_start_cookies(self, func_cookie, url):
  495. self.break_ = True
  496. def update_cookie():
  497. nonlocal self
  498. while self.break_:
  499. try:
  500. cookies = self.browser.get_cookies()
  501. func_cookie(cookies) # 与GUI通信显示cookie
  502. self.cookie_dict[url] = cookies
  503. time.sleep(0.5)
  504. except WebDriverException:
  505. pass
  506. self.cookie_Thread = threading.Thread(target=update_cookie)
  507. self.cookie_Thread.start()
  508. def selenium_mode(self, func_cookie, url):
  509. if self.url_text.new and self.last_mode == "get": # 重新启动
  510. self.start_selenium()
  511. elif self.last_mode is None:
  512. self.start_selenium(False)
  513. try:
  514. self.selenium_run(url)
  515. except WebDriverException:
  516. self.start_selenium()
  517. self.selenium_run(url)
  518. self.selenium_cookies()
  519. self.selenium_start_cookies(func_cookie, url)
  520. class PageParserBase:
  521. def __init__(self, downloader):
  522. self.downloader = downloader
  523. self.downloader.set_page_parser(self)
  524. self.func_list = []
  525. self.func_dict = {}
  526. self.n = 0
  527. self.init()
  528. def init(self, url=""):
  529. self.element_dict = {} # 记录属性的名字
  530. self.url_text = url
  531. @staticmethod
  532. def add_base(func): # 装饰器
  533. def wrap(num=None, name=None, *args, **kwargs):
  534. try:
  535. func(num=num, name=name, *args, **kwargs)
  536. return True, ''
  537. except BaseException as e:
  538. return False, str(e)
  539. return wrap
  540. @plugin_class_loading(get_path(r'template/crawler'))
  541. class PageParserFunc(PageParserBase):
  542. def tra_func(self):
  543. self.func_list = []
  544. self.func_dict = {}
  545. self.n = 0
  546. def add_func(self, name, func):
  547. self.func_list.append(f"{name}[{self.n}]")
  548. self.func_dict[f"{name}[{self.n}]"] = func
  549. self.n += 1
  550. def del_func(self, index, end=False):
  551. if end:
  552. index = len(self.func_list) - index - 1
  553. del self.func_dict[self.func_list[index]]
  554. self.func_list[index] = "Func_have_been_del"
  555. self.func_dict["Func_have_been_del"] = lambda *args, **kwargs: None
  556. def return_func(self, only=True):
  557. if only:
  558. return self.func_list.copy()
  559. else:
  560. return [
  561. f"var[{index}]@ {i}" for index, i in enumerate(self.func_list.copy())
  562. ]
  563. @plugin_class_loading(get_path(r'template/crawler'))
  564. class PageParserFind(PageParserFunc):
  565. def find_id(self, id_, not_all=False, **kwargs):
  566. @self.add_base
  567. def find(num, name, *args, **kwargs):
  568. nonlocal self, id_
  569. if not_all:
  570. self.element_dict[f"{name}[{num}]"] = [
  571. self.browser.find_element_by_id(id_)
  572. ] # 返回必须是list
  573. else:
  574. self.element_dict[f"{name}[{num}]"] = self.browser.find_elements_by_id(id_)
  575. self.add_func(f"find_ID:{id_}", find) # 添加func
  576. def find_class(self, class_name, not_all=False, **kwargs):
  577. @self.add_base
  578. def find(num, name, *args, **kwargs):
  579. nonlocal self, class_name
  580. self.browser = self.browser
  581. if not_all:
  582. self.element_dict[f"{name}[{num}]"] = [
  583. self.browser.find_element_by_class_name(class_name)
  584. ] # 返回必须是list
  585. else:
  586. self.element_dict[
  587. f"{name}[{num}]"
  588. ] = self.browser.find_elements_by_class_name(
  589. class_name
  590. ) # 返回必须是list
  591. self.add_func(f"find_class:{class_name}", find) # 添加func
  592. def find_name(self, name_, not_all=False, **kwargs):
  593. @self.add_base
  594. def find(num, name, *args, **kwargs):
  595. nonlocal self, name_
  596. if not_all:
  597. self.element_dict[f"{name}[{num}]"] = [
  598. self.browser.find_element_by_name(name_)
  599. ] # 返回必须是list
  600. else:
  601. self.element_dict[f"{name}[{num}]"] = self.browser.find_elements_by_name(
  602. name_
  603. ) # 返回必须是list
  604. self.add_func(f"find_name:{name_}", find) # 添加func
  605. def find_xpath(self, xpath, not_all=False, **kwargs):
  606. @self.add_base
  607. def find(num, name, *args, **kwargs):
  608. nonlocal self, xpath
  609. if not_all:
  610. self.element_dict[f"{name}[{num}]"] = [
  611. self.browser.find_element_by_xpath(xpath)
  612. ] # 返回必须是list
  613. else:
  614. self.element_dict[f"{name}[{num}]"] = self.browser.find_elements_by_xpath(
  615. xpath
  616. ) # 返回必须是list
  617. self.add_func(f"find_xpath:{xpath}", find) # 添加func
  618. def find_css(self, css_selector, not_all=False, **kwargs):
  619. @self.add_base
  620. def find(num, name, *args, **kwargs):
  621. nonlocal self, css_selector
  622. if not_all:
  623. self.element_dict[f"{name}[{num}]"] = [
  624. self.browser.find_element_by_css_selector(css_selector)
  625. ] # 返回必须是list
  626. else:
  627. self.element_dict[
  628. f"{name}[{num}]"
  629. ] = self.browser.find_elements_by_css_selector(
  630. css_selector
  631. ) # 返回必须是list
  632. self.add_func(f"find_css:{css_selector}", find) # 添加func
  633. def find_tag_name(self, tag_name, not_all=False, **kwargs):
  634. @self.add_base
  635. def find(num, name, *args, **kwargs):
  636. nonlocal self, tag_name
  637. if not_all:
  638. self.element_dict[f"{name}[{num}]"] = [
  639. self.browser.find_element_by_tag_name(tag_name)
  640. ] # 返回必须是list
  641. else:
  642. self.element_dict[f"{name}[{num}]"] = self.browser.find_elements_by_tag_name(
  643. tag_name
  644. ) # 返回必须是list
  645. self.add_func(f"find_tagName:{tag_name}", find) # 添加func\
  646. def find_link_text(self, link_text, not_all=False, **kwargs): # 匹配link
  647. @self.add_base
  648. def find(num, name, *args, **kwargs):
  649. nonlocal self, link_text
  650. if not_all:
  651. self.element_dict[f"{name}[{num}]"] = [
  652. self.browser.find_element_by_link_text(link_text)
  653. ] # 返回必须是list
  654. else:
  655. self.element_dict[
  656. f"{name}[{num}]"
  657. ] = self.browser.find_elements_by_link_text(
  658. link_text
  659. ) # 返回必须是list
  660. self.add_func(f"find_link_text:{link_text}", find) # 添加func
  661. def find_partial_link_text(
  662. self, partial_link_text, not_all=False, **kwargs
  663. ): # 模糊匹配
  664. @self.add_base
  665. def find(num, name, *args, **kwargs):
  666. nonlocal self, partial_link_text
  667. if not_all:
  668. self.element_dict[f"{name}[{num}]"] = [
  669. self.browser.find_element_by_partial_link_text(partial_link_text)
  670. ] # 返回必须是list
  671. else:
  672. self.element_dict[f"{name}[{num}]"] = [
  673. self.browser.find_element_by_partial_link_text(partial_link_text)
  674. ] # 返回必须是list
  675. self.add_func(f"find_partial_link_text:{partial_link_text}", find) # 添加func
  676. def find_switch_to_alert(self, *args, **kwargs): # 定位弹出框
  677. @self.add_base
  678. def find(num, name, *args, **kwargs):
  679. nonlocal self
  680. self.element_dict[f"{name}[{num}]"] = [self.browser.switch_to.alert()]
  681. self.add_func(f"find_alert", find) # 添加func
  682. def find_switch_to_active_element(self, *args, **kwargs): # 定位焦点元素
  683. @self.add_base
  684. def find(num, name, *args, **kwargs):
  685. nonlocal self
  686. self.element_dict[f"{name}[{num}]"] = [self.browser.switch_to.active_element()]
  687. self.add_func(f"active_element", find) # 添加func
  688. def find_switch_to_frame(self, reference, is_id=False, *args, **kwargs): # 定位Frame
  689. @self.add_base
  690. def find(num, name, *args, **kwargs):
  691. nonlocal self, reference, is_id
  692. if reference is None:
  693. self.element_dict[f"{name}[{num}]"] = [
  694. self.browser.default_content()
  695. ] # 回到主文档
  696. elif reference == "":
  697. self.element_dict[f"{name}[{num}]"] = [self.browser.parent_frame()] # 回到父文档
  698. else:
  699. if is_id:
  700. reference = int(reference)
  701. self.element_dict[f"{name}[{num}]"] = [
  702. self.browser.switch_to.frame(str(reference))
  703. ] # 定位进入文档
  704. func_name = {None: "主文档", "": "父文档"}.get(reference, reference)
  705. self.add_func(f"find_frame:{func_name}", find) # 添加func
  706. @plugin_class_loading(get_path(r'template/crawler'))
  707. class PageParserActionListBox(PageParserFunc):
  708. def deselect_by_index(
  709. self, element_value, deselect, index=0, **kwargs
  710. ): # 根据index取消选择
  711. @self.add_base
  712. def action(*args, **kwargs):
  713. nonlocal self
  714. self.element_dict[element_value][index].deselect_by_index(int(deselect))
  715. self.add_func(
  716. f"deselect_by_index:{deselect}>{element_value}[{index}]", action
  717. ) # 添加func
  718. def deselect_by_text(
  719. self, element_value, deselect, index=0, **kwargs
  720. ): # 根据text取消选择
  721. @self.add_base
  722. def action(*args, **kwargs):
  723. nonlocal self
  724. self.element_dict[element_value][index].deselect_by_visible_text(deselect)
  725. self.add_func(
  726. f"deselect_by_text:{deselect}>{element_value}[{index}]", action
  727. ) # 添加func
  728. def deselect_by_value(
  729. self, element_value, deselect, index=0, **kwargs
  730. ): # 根据value取消选择
  731. @self.add_base
  732. def action(*args, **kwargs):
  733. nonlocal self
  734. self.element_dict[element_value][index].deselect_by_value(deselect)
  735. self.add_func(
  736. f"deselect_by_value:{deselect}>{element_value}[{index}]", action
  737. ) # 添加func
  738. def select_by_index(self, element_value, deselect, index=0, **kwargs): # 根据index选择
  739. @self.add_base
  740. def action(*args, **kwargs):
  741. nonlocal self
  742. self.element_dict[element_value][index].select_by_index(int(deselect))
  743. self.add_func(
  744. f"select_by_index:{deselect}>{element_value}[{index}]", action
  745. ) # 添加func
  746. def select_by_text(self, element_value, deselect, index=0, **kwargs): # 根据text选择
  747. @self.add_base
  748. def action(*args, **kwargs):
  749. nonlocal self
  750. self.element_dict[element_value][index].select_by_visible_text(deselect)
  751. self.add_func(
  752. f"select_by_text:{deselect}>{element_value}[{index}]", action
  753. ) # 添加func
  754. def select_by_value(self, element_value, deselect, index=0, **kwargs): # 根据value选择
  755. @self.add_base
  756. def action(*args, **kwargs):
  757. nonlocal self
  758. self.element_dict[element_value][index].select_by_value(deselect)
  759. self.add_func(
  760. f"select_by_value:{deselect}>{element_value}[{index}]", action
  761. ) # 添加func
  762. @plugin_class_loading(get_path(r'template/crawler'))
  763. class PageParserAction(PageParserFunc):
  764. def send_keys(self, text, element_value, index=0, **kwargs): # 输入文字
  765. @self.add_base
  766. def action(*args, **kwargs):
  767. nonlocal self
  768. self.element_dict[element_value][index].send_keys(text)
  769. self.add_func(f"sent_text:{text}>{element_value}[{index}]", action) # 添加func
  770. def authentication(
  771. self, user, passwd, element_value, index=0, **kwargs
  772. ): # 输入验证(User&Password)
  773. @self.add_base
  774. def action(*args, **kwargs):
  775. nonlocal self
  776. self.element_dict[element_value][index].authenticate(user, passwd)
  777. self.add_func(
  778. f"Authentication:{user};{passwd}>{element_value}[{index}]", action
  779. ) # 添加func
  780. def clear(self, element_value, index=0, **kwargs): # 清空文本
  781. @self.add_base
  782. def action(*args, **kwargs):
  783. nonlocal self
  784. self.element_dict[element_value][index].clear()
  785. self.add_func(f"clear_text>{element_value}[{index}]", action) # 添加func
  786. def click(self, element_value, index=0, **kwargs): # 点击按钮
  787. @self.add_base
  788. def action(*args, **kwargs):
  789. nonlocal self
  790. self.element_dict[element_value][index].click()
  791. self.add_func(f"click>{element_value}[{index}]", action) # 添加func
  792. def accept(self, element_value, index=0, **kwargs): # 点击确定(弹出框)
  793. @self.add_base
  794. def action(*args, **kwargs):
  795. nonlocal self
  796. self.element_dict[element_value][index].accept()
  797. self.add_func(f"accept>{element_value}[{index}]", action) # 添加func
  798. def dismiss(self, element_value, index=0, **kwargs): # 点击取消(弹出框)
  799. @self.add_base
  800. def action(*args, **kwargs):
  801. nonlocal self
  802. self.element_dict[element_value][index].dismiss()
  803. self.add_func(f"dismiss>{element_value}[{index}]", action) # 添加func
  804. def submit(self, element_value, index=0, **kwargs): # 提交表单
  805. @self.add_base
  806. def action(*args, **kwargs):
  807. nonlocal self
  808. self.element_dict[element_value][index].submit()
  809. self.add_func(f"submit>{element_value}[{index}]", action) # 添加func
  810. def run_js(self, js, **kwargs):
  811. @self.add_base
  812. def action(num, name, *args, **kwargs):
  813. nonlocal self
  814. get = self.browser.execute_script(js)
  815. if hasattr(get, "__getitem__"): # 可切片
  816. self.element_dict[f"{name}[{num}]"] = get # 返回必须是list
  817. else:
  818. self.element_dict[f"{name}[{num}]"] = [get]
  819. self.add_func(f"run_js:{js}", action)
  820. class PageParserAutomation(PageParserFind, PageParserActionListBox, PageParserAction):
  821. pass
  822. @plugin_class_loading(get_path(r'template/crawler'))
  823. class PageParserCookies(PageParserFunc):
  824. def del_all_cookies(self, **kwargs): # 删除所有曲奇
  825. @self.add_base
  826. def action(*args, **kwargs):
  827. nonlocal self
  828. self.browser.delete_all_cookies()
  829. self.add_func(f"del_all_cookies", action)
  830. def del_cookies(self, cookies_name, **kwargs): # 删除指定曲奇
  831. @self.add_base
  832. def action(*args, **kwargs):
  833. nonlocal self
  834. self.browser.delete_cookie(cookies_name)
  835. self.add_func(f"del_cookies:{cookies_name}", action)
  836. def add_cookies(self, cookies, **kwargs): # 添加指定曲奇
  837. @self.add_base
  838. def action(*args, **kwargs):
  839. nonlocal self
  840. self.browser.add_cookie(cookies)
  841. self.add_func(f"add_cookies:{cookies}", action)
  842. def update_cookies(self, cookies_name, cookies, **kwargs): # 更新曲奇
  843. @self.add_base
  844. def action(*args, **kwargs):
  845. nonlocal self
  846. now_cookies = self.browser.get_cookie(cookies_name)
  847. self.browser.delete_cookie(cookies_name)
  848. now_cookies.update(cookies)
  849. self.browser.add_cookie(now_cookies)
  850. self.add_func(f"add_cookies:{cookies}", action)
  851. def get_cookies(self, cookies_name, **kwargs): # 获取指定曲奇
  852. @self.add_base
  853. def action(num, name, *args, **kwargs):
  854. nonlocal self
  855. self.element_dict[f"{name}[{num}]"] = [
  856. self.browser.get_cookie(cookies_name)
  857. ]
  858. self.add_func(f"get_cookies:{cookies_name}", action)
  859. def get_all_cookies(self, **kwargs): # 获取所有曲奇
  860. @self.add_base
  861. def action(num, name, *args, **kwargs):
  862. nonlocal self
  863. self.element_dict[f"{name}[{num}]"] = self.browser.get_cookie()
  864. self.add_func(f"get_all_cookies", action)
  865. @plugin_class_loading(get_path(r'template/crawler'))
  866. class PageParserBrowserActions(PageParserFunc):
  867. def back(self, **kwargs): # 返回
  868. @self.add_base
  869. def action(*args, **kwargs):
  870. nonlocal self
  871. self.browser.back()
  872. self.add_func(f"BACK", action)
  873. def forward(self, **kwargs): # 前进
  874. @self.add_base
  875. def action(*args, **kwargs):
  876. nonlocal self
  877. self.browser.forward()
  878. self.add_func(f"FORWARD", action)
  879. def refresh(self, **kwargs): # 刷新
  880. @self.add_base
  881. def action(*args, **kwargs):
  882. nonlocal self
  883. self.browser.refresh()
  884. self.add_func(f"REFRESH", action)
  885. def wait_sleep(self, time: int = 2, **kwargs): # 暴力等待
  886. @self.add_base
  887. def action(*args, **kwargs):
  888. nonlocal self
  889. sleep(time)
  890. self.add_func(f"WAIT:{time}s", action)
  891. def set_wait(self, time: int = 2, **kwargs): # 隐式等待
  892. @self.add_base
  893. def action(*args, **kwargs):
  894. nonlocal self
  895. sleep(time)
  896. self.add_func(f"Loading_wait:{time}s", action)
  897. class PageParserBrowser(PageParserBrowserActions, PageParserCookies):
  898. pass
  899. @plugin_class_loading(get_path(r'template/crawler'))
  900. class PageParserNeighbor(PageParserFunc):
  901. def __get_other_base(
  902. self, element_value, index: (slice, int), who="children", **kwargs
  903. ): # 获得子、后代、兄弟标签的基类
  904. @self.add_base
  905. def action(num, name, *args, **kwargs):
  906. nonlocal self
  907. iter_list = self.list_slicing(index, element_value)
  908. paser_list = []
  909. for bs in iter_list:
  910. if who != "brothers":
  911. paser_list += {
  912. "children": bs.children,
  913. "offspring": bs.descendants,
  914. "down": bs.next_siblings,
  915. "up": bs.previous_siblings,
  916. }.get(who, bs.children)
  917. else:
  918. paser_list += bs.previous_siblings
  919. paser_list += bs.next_siblings
  920. self.element_dict[f"{name}[{num}]"] = list(set(paser_list))
  921. self.add_func(f"get_{who}:{element_value}[{index}]", action) # 添加func
  922. def get_children(self, element_value, index: (slice, int), **kwargs):
  923. return self.__get_other_base(element_value, index)
  924. def get_offspring(self, element_value, index: (slice, int), **kwargs):
  925. return self.__get_other_base(element_value, index, "offspring")
  926. def get_up(self, element_value, index: (slice, int), **kwargs):
  927. return self.__get_other_base(element_value, index, "up")
  928. def get_down(self, element_value, index: (slice, int), **kwargs):
  929. return self.__get_other_base(element_value, index, "down")
  930. def get_brothers(self, element_value, index: (slice, int), **kwargs):
  931. return self.__get_other_base(element_value, index, "brothers")
  932. @plugin_class_loading(get_path(r'template/crawler'))
  933. class PageParserDataFindall(PageParserFunc):
  934. def findall(
  935. self,
  936. element_value,
  937. tag: (str, list),
  938. attribute: dict,
  939. limit,
  940. recursive,
  941. index: (slice, int),
  942. **kwargs,
  943. ): # 根据标签定位
  944. if isinstance(tag, str):
  945. tag = str(tag).split(",")
  946. try:
  947. limit = int(limit)
  948. except ValueError:
  949. limit = None
  950. @self.add_base
  951. def action(num, name, *args, **kwargs):
  952. nonlocal self
  953. iter_list = self.list_slicing(index, element_value)
  954. paser_list = []
  955. for bs in iter_list:
  956. try:
  957. re = bs.find_all(tag, attribute, limit=limit, recursive=recursive)
  958. except AttributeError:
  959. try:
  960. if str(bs.name) not in tag:
  961. raise PageParserError
  962. for agrs_name in attribute:
  963. text = attribute[agrs_name]
  964. if isinstance(text, str):
  965. if bs.attrs[agrs_name] != text:
  966. raise PageParserError
  967. else: # 正则匹配
  968. if not regular.match(text, bs.attrs[agrs_name]):
  969. raise PageParserError
  970. re = [bs]
  971. except PageParserError:
  972. re = []
  973. paser_list += re
  974. self.element_dict[f"{name}[{num}]"] = paser_list
  975. self.add_func(f"findAll:{element_value}[{index}]", action) # 添加func
  976. def findall_by_text(
  977. self,
  978. element_value,
  979. text: (regular.compile, str),
  980. limit,
  981. recursive,
  982. index: (slice, int),
  983. **kwargs,
  984. ): # 根据text定位
  985. try:
  986. limit = int(limit)
  987. except ValueError:
  988. limit = None
  989. @self.add_base
  990. def action(num, name, *args, **kwargs):
  991. nonlocal self
  992. iter_list = self.list_slicing(index, element_value)
  993. paser_list = []
  994. for bs in iter_list:
  995. try:
  996. re = bs.find_all(text=text, limit=limit, recursive=recursive)
  997. except AttributeError:
  998. try:
  999. if isinstance(text, str):
  1000. if str(bs.string) != text:
  1001. raise PageParserError
  1002. else:
  1003. if not regular.match(text, str(bs.string)):
  1004. raise PageParserError
  1005. re = [bs]
  1006. except PageParserError:
  1007. re = []
  1008. paser_list += re
  1009. self.element_dict[f"{name}[{num}]"] = paser_list
  1010. self.add_func(f"findAll_by_text:{element_value}[{index}]", action) # 添加func
  1011. @plugin_class_loading(get_path(r'template/crawler'))
  1012. class PageParserDatabase(PageParserFunc):
  1013. def to_database(
  1014. self, element_value, index, data: (str, list), database_name: str, **kwargs
  1015. ): # 传入data Base
  1016. @self.add_base
  1017. def action(*args, **kwargs):
  1018. global data_base
  1019. nonlocal self
  1020. iter_list = self.list_slicing(index, element_value)
  1021. for bs in iter_list:
  1022. new = []
  1023. for i in data:
  1024. if i == "$name&":
  1025. new.append(bs.name)
  1026. elif i == "$self&":
  1027. new.append(str(bs).replace("\n", ""))
  1028. elif i == "$string$":
  1029. new.append(str(bs.string).replace("\n", ""))
  1030. else:
  1031. new.append(bs.attrs.get(i, ""))
  1032. data_base.add_database(database_name, new)
  1033. self.add_func(
  1034. f"DataBase:{data}<{element_value}[{index}]>{database_name}", action
  1035. ) # 添加func
  1036. def to_database_by_re(
  1037. self, element_value, index, data: str, database_name: str, **kwargs
  1038. ): # 通过正则,传入dataBase
  1039. data = regular.compile(data)
  1040. @self.add_base
  1041. def action(*args, **kwargs):
  1042. global data_base
  1043. nonlocal self
  1044. iter_list = self.list_slicing(index, element_value)
  1045. for bs in iter_list:
  1046. new = regular.findall(data, str(bs))
  1047. data_base.add_database(database_name, new)
  1048. self.add_func(
  1049. f"DataBase:{data}<{element_value}[{index}]>{database_name}", action
  1050. ) # 添加func
  1051. @plugin_class_loading(get_path(r'template/crawler'))
  1052. class PageParserDataSource(PageParserFunc):
  1053. def to_text(self, **kwargs): # 获取网页源码
  1054. @self.add_base
  1055. def action(num, name, *args, **kwargs):
  1056. nonlocal self
  1057. try:
  1058. self.element_dict[f"{name}[{num}]"] = [
  1059. self.browser.page_source,
  1060. self.url_text,
  1061. ]
  1062. except AttributeError:
  1063. self.element_dict[f"{name}[{num}]"] = [
  1064. self.browser.text,
  1065. self.url_text,
  1066. ] # request
  1067. self.add_func(f"get_page_source", action)
  1068. def out_html(self, element_value, **kwargs): # 输出网页源码
  1069. @self.add_base
  1070. def action(*args, **kwargs):
  1071. nonlocal self
  1072. md5 = hashlib.md5() # 应用MD5算法
  1073. md5.update(f"{time.time()}_{self.url_text}".encode("utf-8"))
  1074. name = md5.hexdigest()
  1075. save_dir = self.dir + f"{os.sep}" + name + ".cotan_source"
  1076. print(save_dir)
  1077. with open(save_dir, "w") as f:
  1078. f.write(self.element_dict[element_value][0])
  1079. with open(save_dir + ".CoTanURL", "w") as f:
  1080. f.write(self.element_dict[element_value][1])
  1081. self.add_func(f"write_html<{element_value}", action)
  1082. def make_bs(self, element_value, **kwargs): # 解析成bs4对象
  1083. @self.add_base
  1084. def action(num, name, *args, **kwargs):
  1085. nonlocal self
  1086. self.element_dict[f"{name}[{num}]"] = [
  1087. bs4.BeautifulSoup(self.element_dict[element_value][0], "html.parser")
  1088. ]
  1089. self.add_func(f"Parsing:{element_value}", action) # 添加func
  1090. def add_url(
  1091. self,
  1092. element_value,
  1093. index: (slice, int),
  1094. url_name,
  1095. update_func,
  1096. url_args: dict,
  1097. **kwargs,
  1098. ): # 自动添加url
  1099. @self.add_base
  1100. def action(*args, **kwargs):
  1101. nonlocal self
  1102. iter_list = self.list_slicing(index, element_value)
  1103. for bs in iter_list:
  1104. try:
  1105. if url_name == "$name&":
  1106. new_url = bs.name
  1107. elif url_name == "$self&":
  1108. new_url = str(bs).replace("\n", "")
  1109. elif url_name == "$string$":
  1110. new_url = str(bs.string).replace("\n", "")
  1111. else:
  1112. new_url = bs.attrs.get(url_name, "")
  1113. self.downloader.url.add_url(new_url, **url_args)
  1114. except AttributeError:
  1115. pass
  1116. update_func() # 更新tkinter
  1117. self.add_func(f"add_URL<{element_value}[{index}]:{url_name}", action) # 添加func
  1118. def to_json(self, **kwargs):
  1119. @self.add_base
  1120. def action(num, name, *args, **kwargs):
  1121. nonlocal self
  1122. self.element_dict[f"{name}[{num}]"] = [
  1123. self.browser.json()
  1124. ] # request 解析为 json
  1125. self.add_func(f"to_json", action) # 添加func
  1126. @plugin_class_loading(get_path(r'template/crawler'))
  1127. class PageParserTool(PageParserFunc):
  1128. def list_slicing(self, index: (slice, int), element_value):
  1129. if isinstance(index, int):
  1130. return [self.element_dict[element_value][index]]
  1131. else:
  1132. return self.element_dict[element_value][index]
  1133. def get_by_path(
  1134. self, element_value, index: (slice, int), path, **kwargs
  1135. ): # 根据bs4的目录选择
  1136. @self.add_base
  1137. def action(num, name, *args, **kwargs):
  1138. nonlocal self
  1139. iter_list = self.list_slicing(index, element_value)
  1140. paser_list = []
  1141. for bs in iter_list:
  1142. try:
  1143. re = eval(str(path), {"self": bs})
  1144. if re is None:
  1145. raise PageParserError
  1146. paser_list.append(re)
  1147. except BaseException as e:
  1148. logging.warning(str(e))
  1149. self.element_dict[f"{name}[{num}]"] = paser_list
  1150. self.add_func(f"get>{path}:{element_value}[{index}]", action) # 添加func
  1151. def webpage_snapshot(self, **kwargs):
  1152. @self.add_base
  1153. def action(*args, **kwargs):
  1154. nonlocal self
  1155. md5 = hashlib.md5() # 应用MD5算法
  1156. md5.update(f"{time.time()}_{self.url_text}".encode("utf-8"))
  1157. name = md5.hexdigest()
  1158. with open(self.dir + f"{os.sep}" + name + ".png.CoTanURL", "w") as f:
  1159. f.write(self.url_text)
  1160. self.browser.save_screenshot(self.dir + f"{os.sep}" + name + ".png")
  1161. sleep(1)
  1162. self.add_func(f"Webpage_snapshot", action) # 添加func
  1163. class PageParserData(PageParserDatabase, PageParserDataSource, PageParserDataFindall, PageParserTool):
  1164. pass
  1165. @plugin_class_loading(get_path(r'template/crawler'))
  1166. class PageParserChainsWindow(PageParserFunc):
  1167. def get_all_windows(self, *args, **kwargs): # 获取所有句柄
  1168. @self.add_base
  1169. def find(num, name, *args, **kwargs):
  1170. nonlocal self
  1171. # 获得窗口句柄
  1172. self.element_dict[f"{name}[{num}]"] = self.browser.window_handles
  1173. self.add_func(f"get_all_windows", find) # 添加func
  1174. def get_now_windows(self, *args, **kwargs): # 获取当前窗口句柄
  1175. @self.add_base
  1176. def find(num, name, *args, **kwargs):
  1177. nonlocal self
  1178. self.element_dict[f"{name}[{num}]"] = [
  1179. self.browser.current_window_handle
  1180. ] # 获得当前窗口句柄
  1181. self.add_func(f"get_now_window", find) # 添加func
  1182. def switch_to_windwos(self, element_value, index=0, **kwargs): # 切换窗口
  1183. @self.add_base
  1184. def action(*args, **kwargs):
  1185. nonlocal self
  1186. self.browser.switch_to.window(self.element_dict[element_value][index])
  1187. self.add_func(f"switch_to_window>{element_value}[{index}]", action) # 添加func
  1188. @plugin_class_loading(get_path(r'template/crawler'))
  1189. class PageParserClick(PageParserFunc):
  1190. def action_click(self, chains, element_value, index, **kwargs): # 单击左
  1191. @self.add_base
  1192. def action(*args, **kwargs):
  1193. nonlocal self
  1194. self.element_dict[chains][0].click(self.element_dict[element_value][index])
  1195. self.add_func(f"[{chains}]click>[{element_value}][{index}]", action) # 添加func
  1196. def action_double_click(self, chains, element_value, index, **kwargs): # 双击左
  1197. @self.add_base
  1198. def action(*args, **kwargs):
  1199. nonlocal self
  1200. self.element_dict[chains][0].double_click(
  1201. self.element_dict[element_value][index]
  1202. )
  1203. self.add_func(
  1204. f"[{chains}]double_click>[{element_value}][{index}]", action
  1205. ) # 添加func
  1206. def action_click_right(self, chains, element_value, index, **kwargs): # 点击右
  1207. @self.add_base
  1208. def action(*args, **kwargs):
  1209. nonlocal self
  1210. self.element_dict[chains][0].context_click(
  1211. self.element_dict[element_value][index]
  1212. )
  1213. self.add_func(
  1214. f"[{chains}]right_click>[{element_value}][{index}]", action
  1215. ) # 添加func
  1216. def action_click_and_hold(self, chains, element_value, index, **kwargs): # 按住左
  1217. @self.add_base
  1218. def action(*args, **kwargs):
  1219. nonlocal self
  1220. self.element_dict[chains][0].click_and_hold(
  1221. self.element_dict[element_value][index]
  1222. )
  1223. self.add_func(
  1224. f"[{chains}]click_and_hold>[{element_value}][{index}]", action
  1225. ) # 添加func
  1226. @plugin_class_loading(get_path(r'template/crawler'))
  1227. class PageParserChainsMouse(PageParserFunc):
  1228. def action_release(self, chains, element_value, index, **kwargs): # 松开左键
  1229. @self.add_base
  1230. def action(*args, **kwargs):
  1231. nonlocal self
  1232. self.element_dict[chains][0].release(
  1233. self.element_dict[element_value][index]
  1234. )
  1235. self.add_func(f"[{chains}]release>[{element_value}][{index}]", action) # 添加func
  1236. def action_drag_and_drop(
  1237. self, chains, element_value, index, element_value2, index2, **kwargs
  1238. ): # 拽托、松开
  1239. @self.add_base
  1240. def action(*args, **kwargs):
  1241. nonlocal self
  1242. self.element_dict[chains][0].drag_and_drop(
  1243. self.element_dict[element_value][index],
  1244. self.element_dict[element_value2][index2],
  1245. )
  1246. self.add_func(
  1247. f"[{chains}]drag_and_drop>[{element_value}][{index}]", action
  1248. ) # 添加func
  1249. def action_move(self, chains, element_value, index, **kwargs): # 移动鼠标
  1250. @self.add_base
  1251. def action(*args, **kwargs):
  1252. nonlocal self
  1253. self.element_dict[chains][0].move_to_element(
  1254. self.element_dict[element_value][index]
  1255. )
  1256. self.add_func(
  1257. f"[{chains}]drag_and_drop>[{element_value}][{index}]", action
  1258. ) # 添加func
  1259. @plugin_class_loading(get_path(r'template/crawler'))
  1260. class PageParserChainsKeys(PageParserFunc):
  1261. @staticmethod
  1262. def special_keys(key: str, is_special_keys):
  1263. if is_special_keys:
  1264. return keys_name_dict.get(key.lower(), key), f"[{key.upper()}]"
  1265. else:
  1266. return key, key
  1267. def action_key_down(
  1268. self, chains, key, element_value, index, is_special_keys, **kwargs
  1269. ): # down
  1270. new_key, key = self.special_keys(key, is_special_keys)
  1271. @self.add_base
  1272. def action(*args, **kwargs):
  1273. nonlocal self
  1274. self.element_dict[chains][0].key_down(
  1275. new_key, self.element_dict[element_value][index]
  1276. )
  1277. self.add_func(
  1278. f"[{chains}]key_down>{key}:[{element_value}][{index}]", action
  1279. ) # 添加func
  1280. def action_key_up(
  1281. self, chains, key, element_value, index, is_special_keys, **kwargs
  1282. ): # down
  1283. new_key, key = self.special_keys(key, is_special_keys)
  1284. @self.add_base
  1285. def action(*args, **kwargs):
  1286. nonlocal self
  1287. self.element_dict[chains][0].key_up(
  1288. new_key, self.element_dict[element_value][index]
  1289. )
  1290. self.add_func(
  1291. f"[{chains}]key_up>{key}:[{element_value}][{index}]", action
  1292. ) # 添加func
  1293. def action_send_keys_to_element(
  1294. self, chains, key, element_value, index, is_special_keys, **kwargs
  1295. ):
  1296. new_key, key = self.special_keys(key, is_special_keys)
  1297. @self.add_base
  1298. def action(*args, **kwargs):
  1299. nonlocal self
  1300. self.element_dict[chains][0].send_keys_to_element(
  1301. self.element_dict[element_value][index], new_key
  1302. )
  1303. self.add_func(
  1304. f"[{chains}]sent>{key}:[{element_value}][{index}]", action
  1305. ) # 添加func
  1306. def action_send_keys(self, chains, key, is_special_keys, **kwargs): # 发送到焦点元素
  1307. new_key, key = self.special_keys(key, is_special_keys)
  1308. @self.add_base
  1309. def action(*args, **kwargs):
  1310. nonlocal self
  1311. self.element_dict[chains][0].send_keys(new_key)
  1312. self.add_func(f"[{chains}].sent>{key}", action) # 添加func
  1313. @plugin_class_loading(get_path(r'template/crawler'))
  1314. class PageParserChains(PageParserChainsWindow, PageParserClick, PageParserChainsMouse,
  1315. PageParserChainsKeys):
  1316. def make_action_chains(self, **kwargs): # 创建动作链
  1317. @self.add_base
  1318. def action(num, name, *args, **kwargs):
  1319. nonlocal self
  1320. self.element_dict[f"{name}[{num}]"] = [ActionChains(self.browser)]
  1321. self.add_func(f"make_ActionChains", action) # 添加func
  1322. def action_run(self, chains, run_time=1, **kwargs): # 执行
  1323. @self.add_base
  1324. def action(*args, **kwargs):
  1325. nonlocal self
  1326. self.element_dict[chains][0].perform()
  1327. sleep(run_time)
  1328. self.add_func(f"[{chains}].run<{run_time}s", action) # 添加func
  1329. for i in range(1, 13): # F1 - F12按键
  1330. keys_name_dict[f"f{i}"] = eval(f"Keys.F{i}", {'Keys': Keys})
  1331. data_base = DatabaseController()