template.py 53 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606
  1. import bs4
  2. import hashlib
  3. import os
  4. import re as regular
  5. import threading
  6. import time
  7. from abc import ABCMeta, abstractmethod
  8. from selenium import webdriver
  9. from selenium.webdriver import ActionChains
  10. from selenium.webdriver.common.keys import Keys
  11. from time import sleep
  12. import requests
  13. from system import plugin_class_loading, get_path
  # Maps the key names accepted by this module to Selenium key codes.
  # "ALT" keeps its original upper-case spelling for backward compatibility;
  # "alt" is an additional alias that matches the casing of every other entry.
  keys_name_dict = {
      "ctrl": Keys.CONTROL,
      "shift": Keys.SHIFT,
      "tab": Keys.TAB,
      "left_ctrl": Keys.LEFT_CONTROL,
      "left_shift": Keys.LEFT_SHIFT,
      "left_alt": Keys.LEFT_ALT,
      "ALT": Keys.ALT,
      "alt": Keys.ALT,
      "enter": Keys.ENTER,
      "return": Keys.RETURN,
      "backspace": Keys.BACKSPACE,
      "del": Keys.DELETE,
      "pgup": Keys.PAGE_UP,
      "pgdn": Keys.PAGE_DOWN,
      "home": Keys.HOME,
      "end": Keys.END,
      "esc": Keys.ESCAPE,  # was Keys.CANCEL, which is a different key code
      "insert": Keys.INSERT,
      "meta": Keys.META,
      "up": Keys.UP,
      "down": Keys.DOWN,
      "right": Keys.RIGHT,
      "left": Keys.LEFT,
  }  # name -> key code mapping
  class Database(metaclass=ABCMeta):
      """Abstract interface every table implementation has to provide."""

      @abstractmethod
      def __str__(self):
          """Return the textual form of the table."""

      @abstractmethod
      def close(self):
          """Close the table."""

      @abstractmethod
      def add_new(self, data):
          """Add ``data`` to the table."""

      @abstractmethod
      def remove(self):
          """Remove the table."""

      @abstractmethod
      def out_file(self, out_dir):
          """Write the table out to ``out_dir``."""
  class CoTanDB(Database):
      """Text-file table: one comma-separated record per line, prefixed by its id.

      Values are written with ``str()`` and are not escaped, so a value that
      contains a comma or a newline cannot be told apart when reading back.
      """

      def __init__(self, name):
          """Open (or create) ``Database_dir/<name>.cotanDB`` under the working directory.

          The next record id is the number of lines already present in the file.
          """
          root = rf"{os.getcwd()}/Database_dir"
          # Without the directory the open() below fails with FileNotFoundError.
          os.makedirs(root, exist_ok=True)
          self.dir = rf"{root}/{name}.cotanDB"  # path of the table file
          self.file = open(self.dir, "r+" if os.path.exists(self.dir) else "w+")
          self.name = name
          # Reading everything also leaves the position at EOF, so writes append.
          self.id = len(self.file.readlines())

      def __str__(self):
          """Return the table name."""
          return self.name

      def close(self):
          """Close the underlying file; an I/O error while closing is ignored."""
          try:
              self.file.close()
          except OSError:
              pass

      def add_new(self, data):
          """Append one record made of the current id followed by every item of ``data``."""
          record = ",".join([str(self.id), *(str(item) for item in data)])
          self.file.write(record + "\n")
          self.file.flush()
          self.id += 1

      def remove(self):
          """Close the table and delete its file from disk."""
          self.file.close()
          os.remove(self.dir)

      def out_file(self, out_dir):
          """Copy the table file into ``out_dir``.

          The source is read completely before the target is opened, so a
          failure to read never leaves behind a truncated export.
          """
          # NOTE(review): the exported suffix is ".contanDB" while the table
          # itself uses ".cotanDB" - looks like a typo, kept to not rename exports.
          with open(self.dir) as source:
              content = source.read()
          with open(out_dir + fr"/{self.name}.contanDB", "w") as target:
              target.write(content)
  class DatabaseControllerBase:
      """Base of the table controller: owns the name -> table registry."""

      def __init__(self):
          # Registry of the currently open tables, keyed by table name.
          self.database = dict()
  class AddDatabase(DatabaseControllerBase):
      """Adds the ability to open a new table and register it."""

      def add_database(self, name):  # add a table
          """Create a ``CoTanDB`` called ``name`` and store it under that name."""
          table = CoTanDB(name)
          self.database[name] = table
  class DatabaseControllerCustom(metaclass=ABCMeta):
      """Abstract interface of the operations a table controller offers."""

      @abstractmethod
      def close(self, name):
          """Close the table called ``name``."""

      @abstractmethod
      def close_all(self):
          """Close every table."""

      @abstractmethod
      def rm_database(self, name):
          """Remove the table called ``name``."""

      @abstractmethod
      def out(self, name, save_dir):
          """Write the table called ``name`` to ``save_dir``."""

      @abstractmethod
      def return_database(self):
          """Return the known tables."""
  class DatabaseController(AddDatabase, DatabaseControllerCustom):  # table controller
      """Creates, fills, closes, deletes and exports the registered tables."""

      def add_new(self, name, data):  # add a new record
          """Append ``data`` to table ``name``, creating the table on first use."""
          if self.database.get(name) is None:
              self.add_database(name)
          self.database[name].add_new(data)

      def close(self, name):  # close one table
          """Close table ``name`` and forget it; an unknown name is ignored.

          Only the "no such table" case is silent now; the previous blanket
          ``except BaseException`` also hid interrupts and programming errors.
          """
          table = self.database.pop(name, None)
          if table is not None:
              table.close()

      def close_all(self):  # close every table
          """Close all registered tables and empty the registry."""
          for table in self.database.values():
              table.close()
          self.database = {}

      def rm_database(self, name):  # delete one table
          """Delete table ``name`` from disk and from the registry.

          Raises:
              KeyError: if ``name`` is not registered.
          """
          self.database[name].remove()
          del self.database[name]

      def out(self, name, save_dir):  # export one table
          """Export table ``name`` into ``save_dir``.

          Raises:
              KeyError: if ``name`` is not registered.
          """
          self.database[name].out_file(save_dir)

      def return_database(self):
          """Return the names of the registered tables as a list."""
          return list(self.database)
  class LogBase(metaclass=ABCMeta):
      """Abstract interface of a log sink."""

      @abstractmethod
      def write(self, data):
          """Write ``data`` to the log."""

      @abstractmethod
      def close(self):
          """Close the log."""
  class Log(LogBase):
      """Timestamped text log stored as ``<log_dir>/log.coTanLog``."""

      def __init__(self, log_dir):
          """Open the log file for appending, creating it when it does not exist.

          The previous code chose between "r+" and "w+" after testing
          ``log_dir + "log.coTanLog"`` (no path separator), so an existing log
          was normally truncated; with "r+" new entries would have overwritten
          the start of the file. Append mode keeps earlier entries in both cases.
          """
          self.log_dir = log_dir
          self.log_file = open(log_dir + "/log.coTanLog", "a+")

      def write(self, data):
          """Append ``data`` as one line prefixed with the local time and flush it."""
          stamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
          self.log_file.write(f"[{stamp}] " + data + "\n")
          self.log_file.flush()

      def close(self):
          """Close the log file."""
          self.log_file.close()
  class PageBase:
      """Common description of a page to fetch: url, user agent, mode and timeout."""

      def __init__(self, time_out):
          """Store ``time_out`` and start with an empty url/user agent in "PAGE" mode."""
          self.time_out = time_out
          self.mode = "PAGE"
          self.user_agent = ""
          self.url = ""

      def __str__(self):
          """Return a one-line summary of timeout, mode, url and user agent."""
          summary = f"[{self.time_out}s]{self.mode}-{self.url}:UA>{self.user_agent}"
          return summary

      @abstractmethod
      def init(self, *args, **kwargs):
          """Hook for subclasses; this base version does nothing."""
  class __RequestsBase(PageBase):
      """Shared setup for pages that are fetched through ``requests``."""

      def init(self, user_agent, url, cookies):
          """Store url/cookies and build the request headers.

          Args:
              user_agent: value of the User-Agent header; "" selects a default.
              url: address to request.
              cookies: stored on the instance as ``self.cookies``.
          """
          if user_agent == "":
              # A header value must be the bare UA string; the former default
              # started with the Chrome command-line switch `--user-agent ="..."`.
              user_agent = (
                  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36 Edg/80.0.361.66"
              )
          self.user_agent = user_agent
          self.headers = {
              "Accept": "text/html, application/xhtml+xml, image/jxr, */*",
              # Was "Accept - Encoding": the spaces made it an unrelated header name.
              "Accept-Encoding": "gzip, deflate",
              "Accept-Language": "zh-Hans-CN, zh-Hans; q=0.5",
              "Connection": "Keep-Alive",
              "User-Agent": user_agent,
          }
          self.url = url
          self.cookies = cookies
          self.new = True
  @plugin_class_loading(get_path(r'template/crawler'))
  class UrlGet(__RequestsBase):  # page fetched with a requests GET
      """Page description whose request function is ``requests.get``."""

      def __init__(self, url, time_out, user_agent="", cookies=None, **kwargs):
          """Set up a "simplify_get" page; extra keyword arguments are ignored."""
          super(UrlGet, self).__init__(time_out)
          self.requests = requests.get
          self.mode = "simplify_get"
          self.init(user_agent, url, cookies)
  @plugin_class_loading(get_path(r'template/crawler'))
  class UrlPost(__RequestsBase):  # page fetched with a requests POST
      """Page description whose request function is ``requests.post``."""

      def __init__(self, url, data, time_out, user_agent="", cookies=None, **kwargs):
          """Set up a "post" page carrying ``data``; extra keyword arguments are ignored."""
          super(UrlPost, self).__init__(time_out)
          self.requests = requests.post
          self.data = data
          self.mode = "post"
          self.init(user_agent, url, cookies)

      def __str__(self):
          """Return the base summary followed by the posted data."""
          summary = super(UrlPost, self).__str__()
          return summary + f";data>{self.data}"
  @plugin_class_loading(get_path(r'template/crawler'))
  class UrlPage(PageBase):
      """Page description for the browser ("get") mode, holding the Chrome options."""

      def __init__(
          self,
          url,
          time_out,
          first_run=False,
          head=False,
          no_plugins=True,
          no_js=False,
          no_java=False,
          no_img=False,
          user_agent="",
          cookies=None,
          new=False,
          down_load_dir="",
          **kwargs,
      ):
          """Store the page settings and translate the flags into Chrome options.

          Extra keyword arguments are accepted and ignored.
          """
          super(UrlPage, self).__init__(time_out)
          self.mode = "get"
          self.url = url
          self.cookies = cookies  # key under which cookies are stored
          self.new = new  # new page or new browser
          self.down_load_dir = down_load_dir
          self.options = webdriver.ChromeOptions()
          self.init(first_run, head, no_plugins, no_js, no_java, no_img, user_agent)

      def init(self, first_run, head, no_plugins, no_js, no_java, no_img, user_agent):
          """Fill ``self.options`` from the flags and remember the user agent.

          A truthy ``head`` switches the browser to headless mode. The user agent
          is only stored on the instance; it is not added to the options here.
          """
          add_argument = self.options.add_argument
          add_argument("disable-infobars")  # hide the info bar
          self.options.add_experimental_option(
              "prefs",
              {
                  "profile.default_content_settings.popups": 0,
                  "download.default_directory": self.down_load_dir,
              },
          )  # download settings
          # (flag, arguments added when the flag is truthy), in the original order.
          switches = (
              (first_run, ("-first run",)),
              (head, ("--headless", "--disable-gpu")),  # headless settings
              (no_plugins, ("--disable-plugins",)),
              (no_js, ("--disable-javascript",)),
              (no_java, ("--disable-java",)),
              (no_img, ("blink-settings=imagesEnabled=false",)),
          )
          for enabled, arguments in switches:
              if enabled:
                  for argument in arguments:
                      add_argument(argument)
          if user_agent == "":
              user_agent = (
                  f'user-agent ="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  f'Chrome/80.0.3987.132 Safari/537.36"'
              )
          # self.options.add_argument(f'--user-agent ="{UA}"')
          self.user_agent = user_agent

      def __str__(self):
          """Return mode, url and user agent (without the timeout prefix)."""
          summary = f"{self.mode}-{self.url}:UA>{self.user_agent}"
          return summary
  class Urlbase(metaclass=ABCMeta):
      """Abstract base of the url manager: bookkeeping files, queue, history, filters."""

      url_count = 0  # number of url managers created so far

      def __init__(self, dic="", dic_run=""):
          """Open the two bookkeeping files in append mode and reset the queues.

          ``dic`` receives the history file, ``dic_run`` the file of processed
          urls; both file names carry the running manager number.
          """
          Urlbase.url_count += 1
          self.save_dir = dic
          self.dir = dic + f"/url[{Urlbase.url_count}].cot_url"
          self.dir_run = dic_run + f"/url_run[{Urlbase.url_count}].cot_url"
          self.file = open(self.dir, "a")  # url history is written here
          self.file_run = open(self.dir_run, "a")  # processed urls are written here
          self.filter = {}  # filter functions
          self.url_history = []  # url history
          self.url_list = []  # urls waiting to be read

      @abstractmethod
      def filter_func(self, url, **kwargs):
          """Decide whether ``url`` is accepted."""

      @abstractmethod
      def add_filter_func(self, func, name):
          """Register filter ``func`` under ``name``."""

      @abstractmethod
      def del_filter_func(self, index):
          """Remove the filter at ``index``."""

      @abstractmethod
      def return_filter_func(self):
          """Return the registered filters."""

      @abstractmethod
      def close(self):
          """Close the manager."""

      @abstractmethod
      def out_url_history(self, url):
          """Record ``url`` in the history output."""

      @abstractmethod
      def out_url_run(self, url):
          """Record ``url`` in the processed output."""
  @plugin_class_loading(get_path(r'template/crawler'))
  class UrlFile(Urlbase, metaclass=ABCMeta):
      """File side of the url manager: closing and writing the bookkeeping files."""

      def close(self):
          """Close the history file, then the processed-url file."""
          for handle in (self.file, self.file_run):
              handle.close()

      def out_url_history(self, url):  # write url history
          """Append ``url`` as one line to the history file and flush."""
          history = self.file
          history.write(f"{url}\n")
          history.flush()

      def out_url_run(self, url):  # write an already processed url
          """Append ``url`` as one line to the processed-url file and flush."""
          processed = self.file_run
          processed.write(f"{url}\n")
          processed.flush()
  @plugin_class_loading(get_path(r'template/crawler'))
  class UrlAdd(Urlbase, metaclass=ABCMeta):
      """Filter side of the url manager."""

      def filter_func(self, url, **kwargs):  # url filtering
          """Return True when every registered filter accepts ``url``.

          Filters run in registration order and evaluation stops at the first
          one that rejects the url.
          """
          return all(accepts(url) for accepts in self.filter.values())

      def add_filter_func(self, func, name):  # add a filter function
          """Register ``func`` under ``name``, replacing a filter of the same name."""
          self.filter[name] = func

      def del_filter_func(self, index):  # delete a filter function
          """Remove the filter at position ``index`` of the registration order."""
          names = list(self.filter)
          del self.filter[names[index]]

      def return_filter_func(self):
          """Return the names of the registered filters as a list."""
          return list(self.filter)
  @plugin_class_loading(get_path(r'template/crawler'))
  class UrlReturn(Urlbase, metaclass=ABCMeta):
      """Queue side of the url manager: adding, taking and deleting pending pages."""

      def del_url(self, index):  # delete a url
          """Log the pending page at ``index`` as deleted and drop it from the queue."""
          doomed = self.url_list[index]
          self.out_url_run(f"DELETE {doomed}")
          del self.url_list[index]

      def get_url(self):  # take a url
          """Log the first pending page as processed, remove it and return it."""
          url_page = self.url_list[0]
          self.out_url_run(url_page.url)
          self.url_list.pop(0)
          return url_page

      def is_finish(self):
          """Return True when no page is pending."""
          return not len(self.url_list)

      def add_url(self, url, func, data=None, **kwargs):  # add a url
          """Queue ``url`` unless it is already in the history or rejected by a filter.

          ``func`` selects the page type: "get" (or "") builds a ``UrlPage``,
          "simplify_get" a ``UrlGet`` and anything else a ``UrlPost``.
          Returns True when the url was queued, False otherwise.
          """
          if func == "":
              func = "get"
          # History key: the bare url for "get", url plus data for everything else.
          history_key = url if func == "get" else url + str(data)
          # 1. the url must not be in the history, 2. it must pass the filters
          if history_key in self.url_history or not self.filter_func(url, func=func):
              return False  # not queued
          # NOTE(review): self.dir is the history-file path set in
          # Urlbase.__init__, not a directory - confirm it is the intended
          # download directory.
          if func == "get":
              page = UrlPage(url=url, **kwargs, down_load_dir=self.dir)
          elif func == "simplify_get":
              page = UrlGet(url=url, **kwargs, down_load_dir=self.dir)
          else:
              page = UrlPost(url=url, data=data, **kwargs)
          self.url_list.append(page)  # add to the pending pages
          self.url_history.append(history_key)  # add to the url history
          self.out_url_history(history_key)  # write the url history
          return True  # queued
  class SeleniumBase(metaclass=ABCMeta):
      """Abstract interface of the browser download mode."""

      @abstractmethod
      def selenium_mode(self, func_cookie, url):
          """Handle ``url`` in browser mode; ``func_cookie`` is a cookie callback."""
  class RequestsBase(metaclass=ABCMeta):
      """Abstract interface of the requests download mode."""

      @abstractmethod
      def requests_mode(self, func_cookie, url):
          """Handle ``url`` in requests mode; ``func_cookie`` is a cookie callback."""
  class PagedownloaderBase(SeleniumBase, RequestsBase, metaclass=ABCMeta):
      """State shared by the page downloader mixins."""

      downloader_count = 0  # number of downloaders created so far

      def __init__(self, url, dic=""):
          """Remember the url source and working directory and open the log in ``dic``."""
          self.url = url
          self.dir = dic
          self.log = Log(dic)
          PagedownloaderBase.downloader_count += 1
          self.last_mode = None
          self.browser = None
          self.cookie_Thread = None  # cookie polling thread
          self.cookie_dict = {}
          self.cookie_dict_list = {}  # selenium cookies
          self.page_source_dict = {}  # saved page information

      def set_page_parser(self, parser):
          """Attach the parser that is handed the browser after each download."""
          self.parser = parser
  @plugin_class_loading(get_path(r'template/crawler'))
  class PageDownloaderRun(PagedownloaderBase, metaclass=ABCMeta):
      """Run control of the downloader: start a download, stop, close the log."""

      def close(self):
          """Close the log."""
          self.log.close()

      def stop(self):
          """Clear the polling flag and quit the browser when a download has run."""
          self.break_ = False
          if self.last_mode is None:
              return
          try:
              self.browser.quit()
          except BaseException:
              pass
          self.last_mode = None

      def start_to_run(self, *args, func_cookie):  # fetch one url -> one page
          """Take the next url, download it in its mode and hand the result to the parser.

          Returns ``self.browser``.
          """
          self.break_ = False
          self.page_source_dict = {}
          self.url_text = self.url.get_url()  # take one url
          url = self.url_text.url
          try:
              if self.url_text.mode != "get":  # requests mode
                  self.requests_mode(func_cookie, url)
              else:
                  self.selenium_mode(func_cookie, url)
          except BaseException:
              # Swallowed on purpose: last_mode must still be set below,
              # otherwise the driver could not be closed later.
              pass
          self.last_mode = self.url_text.mode
          self.parser.browser = self.browser
          self.parser.init(url)
          return self.browser
  @plugin_class_loading(get_path(r'template/crawler'))
  class PageDownloaderCookies(PagedownloaderBase, metaclass=ABCMeta):
      """Cookie operations forwarded to ``self.browser``."""

      def monitoring_del_cookies(self, name):  # delete one cookie
          """Delete the cookie called ``name``."""
          self.browser.delete_cookie(name)

      def monitoring_clear_cookier(self):  # clear the cookies
          """Delete every cookie."""
          self.browser.delete_all_cookies()

      def monitoring_add_cookies(self, cookies: dict):  # add a cookie
          """Add the cookie described by ``cookies``."""
          self.browser.add_cookie(cookies)

      def monitoring_update_cookies(self, name, cookies: dict):
          """Merge ``cookies`` into the first cookie called ``name`` and re-add it.

          Raises:
              Exception: if no cookie is called ``name``.
          """
          current = next(
              (
                  cookie
                  for cookie in self.browser.get_cookies()
                  if cookie.get("name", None) == name
              ),
              None,
          )
          if current is None:
              raise Exception
          self.browser.delete_cookie(name)  # drop the old cookie
          current.update(cookies)
          self.browser.add_cookie(current)
  @plugin_class_loading(get_path(r'template/crawler'))
  class PageDownloaderRequests(PageDownloaderRun, metaclass=ABCMeta):
      """Requests download mode: the response object is kept in ``self.browser``."""

      def requests_start_cookies(self, func_cookie, url):
          """Save the response cookies under ``url`` and report them to ``func_cookie``."""
          saved = requests.utils.dict_from_cookiejar(self.browser.cookies)
          self.cookie_dict[url] = saved  # keep the cookies
          func_cookie([self.cookie_dict[url]])

      def requests_run(self, parameters, url):
          """Issue the request with the page's headers, timeout and ``parameters``."""
          page = self.url_text
          self.browser = page.requests(
              url,
              headers=page.headers,
              timeout=page.time_out,
              **parameters,
          )

      def requests_data(self, parameters):
          """Add the page's data to ``parameters`` for "post" pages and return it."""
          if self.url_text.mode != "post":
              return parameters
          parameters["data"] = self.url_text.data
          return parameters

      def requests_cookies(self, func_cookie):
          """Return the request parameters holding the stored cookies, if any.

          ``func_cookie`` receives the stored cookies in a list, or an empty list
          when nothing could be looked up.
          """
          try:
              stored = self.cookie_dict[self.url_text.cookies]
          except BaseException:
              func_cookie([])
              return {}
          func_cookie([stored])
          return {"cookies": stored}

      def requests_mode(self, func_cookie, url):
          """Run one request: cookies, data, request, then save the new cookies."""
          parameters = self.requests_data(self.requests_cookies(func_cookie))
          self.requests_run(parameters, url)
          self.requests_start_cookies(func_cookie, url)
  @plugin_class_loading(get_path(r'template/crawler'))
  class PageDownloaderSelenium(PageDownloaderRun, metaclass=ABCMeta):
      """Selenium download mode: drives a Chrome instance kept in ``self.browser``."""

      def selenium_quit(self):
          """Quit the current browser; any failure is ignored."""
          try:
              self.browser.quit()
          except BaseException:
              pass

      def selenium_cookies(self):
          """Replace the browser cookies with the ones stored for this page.

          Nothing happens when the page is not flagged ``new`` or when no cookies
          are stored under its ``cookies`` key.
          """
          # NOTE(review): nothing in the visible part of this file writes
          # cookie_dict_list (the poller stores into cookie_dict) - confirm
          # where it is filled.
          try:
              if not self.url_text.new:
                  raise Exception
              cookies_list = self.cookie_dict_list[self.url_text.cookies]
          except BaseException:
              pass
          else:
              self.monitoring_clear_cookier()
              try:
                  for i in cookies_list:
                      self.monitoring_add_cookies(i)
              except BaseException:
                  pass

      def start_selenium(self, quit=True):
          """Start a new Chrome with the page's options, quitting the old one first if asked."""
          if quit:
              self.selenium_quit()
          self.browser = webdriver.Chrome(chrome_options=self.url_text.options)

      def selenium_run(self, url):
          """Apply the page timeout to loading and scripts, then open ``url``."""
          self.browser.set_page_load_timeout(self.url_text.time_out)  # page load timeout
          self.browser.set_script_timeout(self.url_text.time_out)  # async js timeout
          self.browser.get(url)

      def selenium_start_cookies(self, func_cookie, url):
          """Start a thread that reports the browser cookies until ``self.break_`` is cleared.

          Each round passes the cookies to ``func_cookie`` and stores them in
          ``self.cookie_dict[url]``.
          """
          self.break_ = True

          def update_cookie():
              while self.break_:
                  try:
                      cookies = self.browser.get_cookies()
                      func_cookie(cookies)  # show the cookies in the GUI
                      self.cookie_dict[url] = cookies
                  except Exception:
                      # The browser may be gone or restarting; try again next round.
                      pass
                  # Sleep on every round. The pause used to sit inside the try
                  # block, so a failing get_cookies() turned this into a busy loop.
                  time.sleep(0.5)

          self.cookie_Thread = threading.Thread(target=update_cookie)
          self.cookie_Thread.start()

      def selenium_mode(self, func_cookie, url):
          """Open ``url`` in the browser, restore cookies and start cookie reporting.

          A browser is (re)started when the page asks for a new one after a
          previous browser run, or when nothing has run yet. If loading fails,
          the browser is restarted once and the load is retried.
          """
          if self.url_text.new and self.last_mode == "get":  # restart
              self.start_selenium()
          elif self.last_mode is None:
              self.start_selenium(False)
          try:
              self.selenium_run(url)
          except BaseException:
              self.start_selenium()
              self.selenium_run(url)
          self.selenium_cookies()
          self.selenium_start_cookies(func_cookie, url)
  class PageParserBase:
      """Base of the page parser: registers itself with the downloader and keeps the step lists."""

      def __init__(self, downloader):
          """Attach to ``downloader`` and start with no registered steps."""
          self.downloader = downloader
          downloader.set_page_parser(self)
          self.n = 0
          self.func_dict = {}
          self.func_list = []
          self.init()

      def init(self, url=""):
          """Reset the stored elements and remember ``url``."""
          self.url_text = url
          self.element_dict = {}  # elements stored by name

      @staticmethod
      def add_base(func):  # decorator
          """Wrap ``func`` so that it reports success as a bool instead of raising.

          The wrapper forwards ``num`` and ``name`` as keywords and returns True
          when ``func`` finished, False when it raised anything.
          """

          def wrap(num=None, name=None, *args, **kwargs) -> bool:
              try:
                  func(num=num, name=name, *args, **kwargs)
              except BaseException:
                  return False
              else:
                  return True

          return wrap
  @plugin_class_loading(get_path(r'template/crawler'))
  class PageParserFunc(PageParserBase):
      """Book-keeping of the registered steps (ordered names plus name -> callable)."""

      def tra_func(self):
          """Forget every registered step and restart the numbering."""
          self.n = 0
          self.func_dict = {}
          self.func_list = []

      def add_func(self, name, func):
          """Register ``func`` under ``name`` suffixed with the running number."""
          key = f"{name}[{self.n}]"
          self.func_list.append(key)
          self.func_dict[key] = func
          self.n += 1

      def del_func(self, index, end=False):
          """Replace the step at ``index`` by a do-nothing placeholder.

          With ``end`` true, ``index`` counts from the end of the list.
          """
          if end:
              index = len(self.func_list) - index - 1
          self.func_dict.pop(self.func_list[index])
          self.func_list[index] = "Func_have_been_del"
          self.func_dict["Func_have_been_del"] = lambda *args, **kwargs: None

      def return_func(self, only=True):
          """Return the step names, or ``var[<i>]@ <name>`` labels when ``only`` is false."""
          names = self.func_list.copy()
          if only:
              return names
          return [f"var[{index}]@ {i}" for index, i in enumerate(names)]
  @plugin_class_loading(get_path(r'template/crawler'))
  class PageParserFind(PageParserFunc):
      """Registers locating steps.

      Every step stores its result as a list in ``self.element_dict`` under the
      key ``"<name>[<num>]"`` and is looked up on ``self.browser`` when the step
      runs, not when it is registered.
      """

      def _add_finder(self, label, locate_one, locate_all, value, not_all):
          """Register a step calling the named browser method with ``value``.

          ``locate_one`` is used (and its result wrapped in a list) when
          ``not_all`` is true, ``locate_all`` otherwise.
          """

          @self.add_base
          def find(num, name, *args, **kwargs):
              if not_all:
                  found = [getattr(self.browser, locate_one)(value)]  # must be a list
              else:
                  found = getattr(self.browser, locate_all)(value)
              self.element_dict[f"{name}[{num}]"] = found

          self.add_func(label, find)  # register the step

      def find_id(self, id, not_all=False, **kwargs):
          """Register a lookup by element id."""
          self._add_finder(
              f"find_ID:{id}", "find_element_by_id", "find_elements_by_id", id, not_all
          )

      def find_class(self, class_name, not_all=False, **kwargs):
          """Register a lookup by class name."""
          self._add_finder(
              f"find_class:{class_name}",
              "find_element_by_class_name",
              "find_elements_by_class_name",
              class_name,
              not_all,
          )

      def find_name(self, name_, not_all=False, **kwargs):
          """Register a lookup by the ``name`` attribute."""
          self._add_finder(
              f"find_name:{name_}",
              "find_element_by_name",
              "find_elements_by_name",
              name_,
              not_all,
          )

      def find_xpath(self, xpath, not_all=False, **kwargs):
          """Register a lookup by XPath."""
          self._add_finder(
              f"find_xpath:{xpath}",
              "find_element_by_xpath",
              "find_elements_by_xpath",
              xpath,
              not_all,
          )

      def find_css(self, css_selector, not_all=False, **kwargs):
          """Register a lookup by CSS selector."""
          self._add_finder(
              f"find_css:{css_selector}",
              "find_element_by_css_selector",
              "find_elements_by_css_selector",
              css_selector,
              not_all,
          )

      def find_tag_name(self, tag_name, not_all=False, **kwargs):
          """Register a lookup by tag name."""
          self._add_finder(
              f"find_tagName:{tag_name}",
              "find_element_by_tag_name",
              "find_elements_by_tag_name",
              tag_name,
              not_all,
          )

      def find_link_text(self, link_text, not_all=False, **kwargs):  # exact link text
          """Register a lookup by exact link text."""
          self._add_finder(
              f"find_link_text:{link_text}",
              "find_element_by_link_text",
              "find_elements_by_link_text",
              link_text,
              not_all,
          )

      def find_partial_link_text(
          self, partial_link_text, not_all=False, **kwargs
      ):  # partial link text
          """Register a lookup by partial link text.

          The "all" branch now uses the plural finder; it used to repeat the
          single-element call and therefore never returned more than one element.
          """
          self._add_finder(
              f"find_partial_link_text:{partial_link_text}",
              "find_element_by_partial_link_text",
              "find_elements_by_partial_link_text",
              partial_link_text,
              not_all,
          )

      def find_switch_to_alert(self, *args, **kwargs):  # locate the alert box
          """Register a step that stores the current alert."""

          @self.add_base
          def find(num, name, *args, **kwargs):
              # `switch_to.alert` is a property; calling its value always failed.
              self.element_dict[f"{name}[{num}]"] = [self.browser.switch_to.alert]

          self.add_func(f"find_alert", find)  # register the step

      def find_switch_to_active_element(self, *args, **kwargs):  # locate the focused element
          """Register a step that stores the element that currently has focus."""

          @self.add_base
          def find(num, name, *args, **kwargs):
              # `switch_to.active_element` is a property as well.
              self.element_dict[f"{name}[{num}]"] = [
                  self.browser.switch_to.active_element
              ]

          self.add_func(f"active_element", find)  # register the step

      def find_switch_to_frame(self, reference, is_id=False, *args, **kwargs):  # locate a frame
          """Register a step that switches the browser to another document.

          ``None`` goes back to the main document, ``""`` to the parent frame.
          Any other ``reference`` selects a frame: as an integer index when
          ``is_id`` is true, otherwise by its string form. The value stored for
          the step is whatever the switch call returns, wrapped in a list.
          """

          @self.add_base
          def find(num, name, *args, **kwargs):
              # The switch helpers live on `switch_to`, not on the driver itself.
              switch = self.browser.switch_to
              if reference is None:
                  result = switch.default_content()  # back to the main document
              elif reference == "":
                  result = switch.parent_frame()  # back to the parent document
              else:
                  # The index used to be turned back into a string, which made
                  # the int() conversion pointless.
                  target = int(reference) if is_id else str(reference)
                  result = switch.frame(target)  # enter the frame
              self.element_dict[f"{name}[{num}]"] = [result]

          func_name = {None: "主文档", "": "父文档"}.get(reference, reference)
          self.add_func(f"find_frame:{func_name}", find)  # register the step
  @plugin_class_loading(get_path(r'template/crawler'))
  class PageParserActionListBox(PageParserFunc):
      """Registers list-box (select / deselect) steps on stored elements."""

      # NOTE(review): the called method names match selenium's `Select` helper;
      # presumably the stored elements are Select-like objects - confirm.

      def _add_select_step(
          self, label, method_name, element_value, index, option, convert=None
      ):
          """Register a step calling ``method_name`` on the stored element.

          ``option`` is passed through ``convert`` (when given) at run time.
          """

          @self.add_base
          def action(*args, **kwargs):
              method = getattr(self.element_dict[element_value][index], method_name)
              method(option if convert is None else convert(option))

          self.add_func(label, action)  # register the step

      def deselect_by_index(
          self, element_value, deselect, index=0, **kwargs
      ):  # deselect by index
          """Register a step that deselects option number ``deselect``."""
          self._add_select_step(
              f"deselect_by_index:{deselect}>{element_value}[{index}]",
              "deselect_by_index",
              element_value,
              index,
              deselect,
              int,
          )

      def deselect_by_text(
          self, element_value, deselect, index=0, **kwargs
      ):  # deselect by text
          """Register a step that deselects the option with visible text ``deselect``."""
          self._add_select_step(
              f"deselect_by_text:{deselect}>{element_value}[{index}]",
              "deselect_by_visible_text",
              element_value,
              index,
              deselect,
          )

      def deselect_by_value(
          self, element_value, deselect, index=0, **kwargs
      ):  # deselect by value
          """Register a step that deselects the option with value ``deselect``."""
          self._add_select_step(
              f"deselect_by_value:{deselect}>{element_value}[{index}]",
              "deselect_by_value",
              element_value,
              index,
              deselect,
          )

      def select_by_index(self, element_value, deselect, index=0, **kwargs):  # select by index
          """Register a step that selects option number ``deselect``."""
          self._add_select_step(
              f"select_by_index:{deselect}>{element_value}[{index}]",
              "select_by_index",
              element_value,
              index,
              deselect,
              int,
          )

      def select_by_text(self, element_value, deselect, index=0, **kwargs):  # select by text
          """Register a step that selects the option with visible text ``deselect``."""
          self._add_select_step(
              f"select_by_text:{deselect}>{element_value}[{index}]",
              "select_by_visible_text",
              element_value,
              index,
              deselect,
          )

      def select_by_value(self, element_value, deselect, index=0, **kwargs):  # select by value
          """Register a step that selects the option with value ``deselect``."""
          self._add_select_step(
              f"select_by_value:{deselect}>{element_value}[{index}]",
              "select_by_value",
              element_value,
              index,
              deselect,
          )
  744. @plugin_class_loading(get_path(r'template/crawler'))
  745. class PageParserAction(PageParserFunc):
  746. def send_keys(self, text, element_value, index=0, **kwargs): # 输入文字
  747. @self.add_base
  748. def action(*args, **kwargs):
  749. nonlocal self
  750. self.element_dict[element_value][index].send_keys(text)
  751. self.add_func(f"sent_text:{text}>{element_value}[{index}]", action) # 添加func
  752. def authentication(
  753. self, user, passwd, element_value, index=0, **kwargs
  754. ): # 输入验证(User&Password)
  755. @self.add_base
  756. def action(*args, **kwargs):
  757. nonlocal self
  758. self.element_dict[element_value][index].authenticate(user, passwd)
  759. self.add_func(
  760. f"Authentication:{user};{passwd}>{element_value}[{index}]", action
  761. ) # 添加func
  762. def clear(self, element_value, index=0, **kwargs): # 清空文本
  763. @self.add_base
  764. def action(*args, **kwargs):
  765. nonlocal self
  766. self.element_dict[element_value][index].clear()
  767. self.add_func(f"clear_text>{element_value}[{index}]", action) # 添加func
  768. def click(self, element_value, index=0, **kwargs): # 点击按钮
  769. @self.add_base
  770. def action(*args, **kwargs):
  771. nonlocal self
  772. self.element_dict[element_value][index].click()
  773. self.add_func(f"click>{element_value}[{index}]", action) # 添加func
  774. def accept(self, element_value, index=0, **kwargs): # 点击确定(弹出框)
  775. @self.add_base
  776. def action(*args, **kwargs):
  777. nonlocal self
  778. self.element_dict[element_value][index].accept()
  779. self.add_func(f"accept>{element_value}[{index}]", action) # 添加func
  780. def dismiss(self, element_value, index=0, **kwargs): # 点击取消(弹出框)
  781. @self.add_base
  782. def action(*args, **kwargs):
  783. nonlocal self
  784. self.element_dict[element_value][index].dismiss()
  785. self.add_func(f"dismiss>{element_value}[{index}]", action) # 添加func
  786. def submit(self, element_value, index=0, **kwargs): # 提交表单
  787. @self.add_base
  788. def action(*args, **kwargs):
  789. nonlocal self
  790. self.element_dict[element_value][index].submit()
  791. self.add_func(f"submit>{element_value}[{index}]", action) # 添加func
  792. def run_js(self, js, **kwargs):
  793. @self.add_base
  794. def action(num, name, *args, **kwargs):
  795. nonlocal self
  796. get = self.browser.execute_script(js)
  797. if hasattr(get, "__getitem__"): # 可切片
  798. self.element_dict[f"{name}[{num}]"] = get # 返回必须是list
  799. else:
  800. self.element_dict[f"{name}[{num}]"] = [get]
  801. self.add_func(f"run_js:{js}", action)
  802. class PageParserAutomation(PageParserFind, PageParserActionListBox, PageParserAction):
  803. pass
  804. @plugin_class_loading(get_path(r'template/crawler'))
  805. class PageParserCookies(PageParserFunc):
  806. def del_all_cookies(self, **kwargs): # 删除所有曲奇
  807. @self.add_base
  808. def action(*args, **kwargs):
  809. nonlocal self
  810. self.browser.delete_all_cookies()
  811. self.add_func(f"del_all_cookies", action)
  812. def del_cookies(self, cookies_name, **kwargs): # 删除指定曲奇
  813. @self.add_base
  814. def action(*args, **kwargs):
  815. nonlocal self
  816. self.browser.delete_cookie(cookies_name)
  817. self.add_func(f"del_cookies:{cookies_name}", action)
  818. def add_cookies(self, cookies, **kwargs): # 添加指定曲奇
  819. @self.add_base
  820. def action(*args, **kwargs):
  821. nonlocal self
  822. self.browser.add_cookie(cookies)
  823. self.add_func(f"add_cookies:{cookies}", action)
  824. def update_cookies(self, cookies_name, cookies, **kwargs): # 更新曲奇
  825. @self.add_base
  826. def action(*args, **kwargs):
  827. nonlocal self
  828. now_cookies = self.browser.get_cookie(cookies_name)
  829. self.browser.delete_cookie(cookies_name)
  830. now_cookies.update(cookies)
  831. self.browser.add_cookie(now_cookies)
  832. self.add_func(f"add_cookies:{cookies}", action)
  833. def get_cookies(self, cookies_name, **kwargs): # 获取指定曲奇
  834. @self.add_base
  835. def action(num, name, *args, **kwargs):
  836. nonlocal self
  837. self.element_dict[f"{name}[{num}]"] = [
  838. self.browser.get_cookie(cookies_name)
  839. ]
  840. self.add_func(f"get_cookies:{cookies_name}", action)
  841. def get_all_cookies(self, **kwargs): # 获取所有曲奇
  842. @self.add_base
  843. def action(num, name, *args, **kwargs):
  844. nonlocal self
  845. self.element_dict[f"{name}[{num}]"] = self.browser.get_cookie()
  846. self.add_func(f"get_all_cookies", action)
  847. @plugin_class_loading(get_path(r'template/crawler'))
  848. class PageParserBrowserActions(PageParserFunc):
  849. def back(self, **kwargs): # 返回
  850. @self.add_base
  851. def action(*args, **kwargs):
  852. nonlocal self
  853. self.browser.back()
  854. self.add_func(f"BACK", action)
  855. def forward(self, **kwargs): # 前进
  856. @self.add_base
  857. def action(*args, **kwargs):
  858. nonlocal self
  859. self.browser.forward()
  860. self.add_func(f"FORWARD", action)
  861. def refresh(self, **kwargs): # 刷新
  862. @self.add_base
  863. def action(*args, **kwargs):
  864. nonlocal self
  865. self.browser.refresh()
  866. self.add_func(f"REFRESH", action)
  867. def wait_sleep(self, time: int = 2, **kwargs): # 暴力等待
  868. @self.add_base
  869. def action(*args, **kwargs):
  870. nonlocal self
  871. sleep(time)
  872. self.add_func(f"WAIT:{time}s", action)
  873. def set_wait(self, time: int = 2, **kwargs): # 隐式等待
  874. @self.add_base
  875. def action(*args, **kwargs):
  876. nonlocal self
  877. sleep(time)
  878. self.add_func(f"Loading_wait:{time}s", action)
  879. class PageParserBrowser(PageParserBrowserActions, PageParserCookies):
  880. pass
  881. @plugin_class_loading(get_path(r'template/crawler'))
  882. class PageParserNeighbor(PageParserFunc):
  883. def __get_other_base(
  884. self, element_value, index: (slice, int), who="children", **kwargs
  885. ): # 获得子、后代、兄弟标签的基类
  886. @self.add_base
  887. def action(num, name, *args, **kwargs):
  888. nonlocal self
  889. iter_list = self.list_slicing(index, element_value)
  890. paser_list = []
  891. for bs in iter_list:
  892. if who != "brothers":
  893. paser_list += {
  894. "children": bs.children,
  895. "offspring": bs.descendants,
  896. "down": bs.next_siblings,
  897. "up": bs.previous_siblings,
  898. }.get(who, bs.children)
  899. else:
  900. paser_list += bs.previous_siblings
  901. paser_list += bs.next_siblings
  902. self.element_dict[f"{name}[{num}]"] = list(set(paser_list))
  903. self.add_func(f"get_{who}:{element_value}[{index}]", action) # 添加func
  904. def get_children(self, element_value, index: (slice, int), **kwargs):
  905. return self.__get_other_base(element_value, index)
  906. def get_offspring(self, element_value, index: (slice, int), **kwargs):
  907. return self.__get_other_base(element_value, index, "offspring")
  908. def get_up(self, element_value, index: (slice, int), **kwargs):
  909. return self.__get_other_base(element_value, index, "up")
  910. def get_down(self, element_value, index: (slice, int), **kwargs):
  911. return self.__get_other_base(element_value, index, "down")
  912. def get_brothers(self, element_value, index: (slice, int), **kwargs):
  913. return self.__get_other_base(element_value, index, "brothers")
  914. @plugin_class_loading(get_path(r'template/crawler'))
  915. class PageParserDataFindall(PageParserFunc):
  916. def findall(
  917. self,
  918. element_value,
  919. tag: (str, list),
  920. attribute: dict,
  921. limit,
  922. recursive,
  923. index: (slice, int),
  924. **kwargs,
  925. ): # 根据标签定位
  926. if isinstance(tag, str):
  927. tag = str(tag).split(",")
  928. try:
  929. limit = int(limit)
  930. except BaseException:
  931. limit = None
  932. @self.add_base
  933. def action(num, name, *args, **kwargs):
  934. nonlocal self
  935. iter_list = self.list_slicing(index, element_value)
  936. paser_list = []
  937. for bs in iter_list:
  938. try:
  939. re = bs.find_all(tag, attribute, limit=limit, recursive=recursive)
  940. except BaseException:
  941. try:
  942. if str(bs.name) not in tag:
  943. raise Exception
  944. for agrs_name in attribute:
  945. text = attribute[agrs_name]
  946. if isinstance(text, str):
  947. if bs.attrs[agrs_name] != text:
  948. raise Exception
  949. else: # 正则匹配
  950. if not regular.match(text, bs.attrs[agrs_name]):
  951. raise Exception
  952. re = [bs]
  953. except BaseException:
  954. re = []
  955. paser_list += re
  956. self.element_dict[f"{name}[{num}]"] = paser_list
  957. self.add_func(f"findAll:{element_value}[{index}]", action) # 添加func
  958. def findall_by_text(
  959. self,
  960. element_value,
  961. text: (regular.compile, str),
  962. limit,
  963. recursive,
  964. index: (slice, int),
  965. **kwargs,
  966. ): # 根据text定位
  967. try:
  968. limit = int(limit)
  969. except BaseException:
  970. limit = None
  971. @self.add_base
  972. def action(num, name, *args, **kwargs):
  973. nonlocal self
  974. iter_list = self.list_slicing(index, element_value)
  975. paser_list = []
  976. for bs in iter_list:
  977. try:
  978. re = bs.find_all(text=text, limit=limit, recursive=recursive)
  979. except BaseException:
  980. try:
  981. if isinstance(text, str):
  982. if str(bs.string) != text:
  983. raise Exception
  984. else:
  985. if not regular.match(text, str(bs.string)):
  986. raise Exception
  987. re = [bs]
  988. except BaseException:
  989. re = []
  990. paser_list += re
  991. self.element_dict[f"{name}[{num}]"] = paser_list
  992. self.add_func(f"findAll_by_text:{element_value}[{index}]", action) # 添加func
  993. @plugin_class_loading(get_path(r'template/crawler'))
  994. class PageParserDatabase(PageParserFunc):
  995. def to_database(
  996. self, element_value, index, data: (str, list), database_name: str, **kwargs
  997. ): # 传入data Base
  998. @self.add_base
  999. def action(*args, **kwargs):
  1000. global data_base
  1001. nonlocal self
  1002. iter_list = self.list_slicing(index, element_value)
  1003. for bs in iter_list:
  1004. new = []
  1005. for i in data:
  1006. if i == "$name&":
  1007. new.append(bs.name)
  1008. elif i == "$self&":
  1009. new.append(str(bs).replace("\n", ""))
  1010. elif i == "$string$":
  1011. new.append(str(bs.string).replace("\n", ""))
  1012. else:
  1013. new.append(bs.attrs.get(i, ""))
  1014. data_base.add_database(database_name, new)
  1015. self.add_func(
  1016. f"DataBase:{data}<{element_value}[{index}]>{database_name}", action
  1017. ) # 添加func
  1018. def to_database_by_re(
  1019. self, element_value, index, data: str, database_name: str, **kwargs
  1020. ): # 通过正则,传入dataBase
  1021. data = regular.compile(data)
  1022. @self.add_base
  1023. def action(*args, **kwargs):
  1024. global data_base
  1025. nonlocal self
  1026. iter_list = self.list_slicing(index, element_value)
  1027. for bs in iter_list:
  1028. new = regular.findall(data, str(bs))
  1029. data_base.add_database(database_name, new)
  1030. self.add_func(
  1031. f"DataBase:{data}<{element_value}[{index}]>{database_name}", action
  1032. ) # 添加func
  1033. @plugin_class_loading(get_path(r'template/crawler'))
  1034. class PageParserDataSource(PageParserFunc):
  1035. def to_text(self, **kwargs): # 获取网页源码
  1036. @self.add_base
  1037. def action(num, name, *args, **kwargs):
  1038. nonlocal self
  1039. try:
  1040. self.element_dict[f"{name}[{num}]"] = [
  1041. self.browser.page_source,
  1042. self.url_text,
  1043. ]
  1044. except BaseException:
  1045. self.element_dict[f"{name}[{num}]"] = [
  1046. self.browser.text,
  1047. self.url_text,
  1048. ] # request
  1049. self.add_func(f"get_page_source", action)
  1050. def out_html(self, element_value, **kwargs): # 输出网页源码
  1051. @self.add_base
  1052. def action(*args, **kwargs):
  1053. nonlocal self
  1054. md5 = hashlib.md5() # 应用MD5算法
  1055. md5.update(f"{time.time()}_{self.url_text}".encode("utf-8"))
  1056. name = md5.hexdigest()
  1057. save_dir = self.dir + "/" + name + ".cotan_source"
  1058. print(save_dir)
  1059. with open(save_dir, "w") as f:
  1060. f.write(self.element_dict[element_value][0])
  1061. with open(save_dir + ".CoTanURL", "w") as f:
  1062. f.write(self.element_dict[element_value][1])
  1063. self.add_func(f"write_html<{element_value}", action)
  1064. def make_bs(self, element_value, **kwargs): # 解析成bs4对象
  1065. @self.add_base
  1066. def action(num, name, *args, **kwargs):
  1067. nonlocal self
  1068. self.element_dict[f"{name}[{num}]"] = [
  1069. bs4.BeautifulSoup(self.element_dict[element_value][0], "html.parser")
  1070. ]
  1071. self.add_func(f"Parsing:{element_value}", action) # 添加func
  1072. def add_url(
  1073. self,
  1074. element_value,
  1075. index: (slice, int),
  1076. url_name,
  1077. update_func,
  1078. url_args: dict,
  1079. **kwargs,
  1080. ): # 自动添加url
  1081. @self.add_base
  1082. def action(*args, **kwargs):
  1083. nonlocal self
  1084. iter_list = self.list_slicing(index, element_value)
  1085. for bs in iter_list:
  1086. try:
  1087. if url_name == "$name&":
  1088. new_url = bs.name
  1089. elif url_name == "$self&":
  1090. new_url = str(bs).replace("\n", "")
  1091. elif url_name == "$string$":
  1092. new_url = str(bs.string).replace("\n", "")
  1093. else:
  1094. new_url = bs.attrs.get(url_name, "")
  1095. self.downloader.url.add_url(new_url, **url_args)
  1096. except BaseException:
  1097. pass
  1098. update_func() # 更新tkinter
  1099. self.add_func(f"add_URL<{element_value}[{index}]:{url_name}", action) # 添加func
  1100. def to_json(self, **kwargs):
  1101. @self.add_base
  1102. def action(num, name, *args, **kwargs):
  1103. nonlocal self
  1104. self.element_dict[f"{name}[{num}]"] = [
  1105. self.browser.json()
  1106. ] # request 解析为 json
  1107. self.add_func(f"to_json", action) # 添加func
  1108. @plugin_class_loading(get_path(r'template/crawler'))
  1109. class PageParserTool(PageParserFunc):
  1110. def list_slicing(self, index: (slice, int), element_value):
  1111. if isinstance(index, int):
  1112. return [self.element_dict[element_value][index]]
  1113. else:
  1114. return self.element_dict[element_value][index]
  1115. def get_by_path(
  1116. self, element_value, index: (slice, int), path, **kwargs
  1117. ): # 根据bs4的目录选择
  1118. @self.add_base
  1119. def action(num, name, *args, **kwargs):
  1120. nonlocal self
  1121. iter_list = self.list_slicing(index, element_value)
  1122. paser_list = []
  1123. for bs in iter_list:
  1124. try:
  1125. re = eval(str(path), {"self": bs})
  1126. if re is None:
  1127. raise Exception
  1128. paser_list.append(re)
  1129. except BaseException:
  1130. pass
  1131. self.element_dict[f"{name}[{num}]"] = paser_list
  1132. self.add_func(f"get>{path}:{element_value}[{index}]", action) # 添加func
  1133. def webpage_snapshot(self, **kwargs):
  1134. @self.add_base
  1135. def action(*args, **kwargs):
  1136. nonlocal self
  1137. md5 = hashlib.md5() # 应用MD5算法
  1138. md5.update(f"{time.time()}_{self.url_text}".encode("utf-8"))
  1139. name = md5.hexdigest()
  1140. with open(self.dir + "/" + name + ".png.CoTanURL", "w") as f:
  1141. f.write(self.url_text)
  1142. self.browser.save_screenshot(self.dir + "/" + name + ".png")
  1143. sleep(1)
  1144. self.add_func(f"Webpage_snapshot", action) # 添加func
  1145. class PageParserData(PageParserDatabase, PageParserDatabase, PageParserDataSource, PageParserDataFindall,
  1146. PageParserTool):
  1147. pass
  1148. @plugin_class_loading(get_path(r'template/crawler'))
  1149. class PageParserChainsWindow(PageParserFunc):
  1150. def get_all_windows(self, *args, **kwargs): # 获取所有句柄
  1151. @self.add_base
  1152. def find(num, name, *args, **kwargs):
  1153. nonlocal self
  1154. # 获得窗口句柄
  1155. self.element_dict[f"{name}[{num}]"] = self.browser.window_handles
  1156. self.add_func(f"get_all_windows", find) # 添加func
  1157. def get_now_windows(self, *args, **kwargs): # 获取当前窗口句柄
  1158. @self.add_base
  1159. def find(num, name, *args, **kwargs):
  1160. nonlocal self
  1161. self.element_dict[f"{name}[{num}]"] = [
  1162. self.browser.current_window_handle
  1163. ] # 获得当前窗口句柄
  1164. self.add_func(f"get_now_window", find) # 添加func
  1165. def switch_to_windwos(self, element_value, index=0, **kwargs): # 切换窗口
  1166. @self.add_base
  1167. def action(*args, **kwargs):
  1168. nonlocal self
  1169. self.browser.switch_to.window(self.element_dict[element_value][index])
  1170. self.add_func(f"switch_to_window>{element_value}[{index}]", action) # 添加func
  1171. @plugin_class_loading(get_path(r'template/crawler'))
  1172. class PageParserClick(PageParserFunc):
  1173. def action_click(self, chains, element_value, index, **kwargs): # 单击左
  1174. @self.add_base
  1175. def action(*args, **kwargs):
  1176. nonlocal self
  1177. self.element_dict[chains][0].click(self.element_dict[element_value][index])
  1178. self.add_func(f"[{chains}]click>[{element_value}][{index}]", action) # 添加func
  1179. def action_double_click(self, chains, element_value, index, **kwargs): # 双击左
  1180. @self.add_base
  1181. def action(*args, **kwargs):
  1182. nonlocal self
  1183. self.element_dict[chains][0].double_click(
  1184. self.element_dict[element_value][index]
  1185. )
  1186. self.add_func(
  1187. f"[{chains}]double_click>[{element_value}][{index}]", action
  1188. ) # 添加func
  1189. def action_click_right(self, chains, element_value, index, **kwargs): # 点击右
  1190. @self.add_base
  1191. def action(*args, **kwargs):
  1192. nonlocal self
  1193. self.element_dict[chains][0].context_click(
  1194. self.element_dict[element_value][index]
  1195. )
  1196. self.add_func(
  1197. f"[{chains}]right_click>[{element_value}][{index}]", action
  1198. ) # 添加func
  1199. def action_click_and_hold(self, chains, element_value, index, **kwargs): # 按住左
  1200. @self.add_base
  1201. def action(*args, **kwargs):
  1202. nonlocal self
  1203. self.element_dict[chains][0].click_and_hold(
  1204. self.element_dict[element_value][index]
  1205. )
  1206. self.add_func(
  1207. f"[{chains}]click_and_hold>[{element_value}][{index}]", action
  1208. ) # 添加func
  1209. @plugin_class_loading(get_path(r'template/crawler'))
  1210. class PageParserChainsMouse(PageParserFunc):
  1211. def action_release(self, chains, element_value, index, **kwargs): # 松开左键
  1212. @self.add_base
  1213. def action(*args, **kwargs):
  1214. nonlocal self
  1215. self.element_dict[chains][0].release(
  1216. self.element_dict[element_value][index]
  1217. )
  1218. self.add_func(f"[{chains}]release>[{element_value}][{index}]", action) # 添加func
  1219. def action_drag_and_drop(
  1220. self, chains, element_value, index, element_value2, index2, **kwargs
  1221. ): # 拽托、松开
  1222. @self.add_base
  1223. def action(*args, **kwargs):
  1224. nonlocal self
  1225. self.element_dict[chains][0].drag_and_drop(
  1226. self.element_dict[element_value][index],
  1227. self.element_dict[element_value2][index2],
  1228. )
  1229. self.add_func(
  1230. f"[{chains}]drag_and_drop>[{element_value}][{index}]", action
  1231. ) # 添加func
  1232. def action_move(self, chains, element_value, index, **kwargs): # 移动鼠标
  1233. @self.add_base
  1234. def action(*args, **kwargs):
  1235. nonlocal self
  1236. self.element_dict[chains][0].move_to_element(
  1237. self.element_dict[element_value][index]
  1238. )
  1239. self.add_func(
  1240. f"[{chains}]drag_and_drop>[{element_value}][{index}]", action
  1241. ) # 添加func
  1242. @plugin_class_loading(get_path(r'template/crawler'))
  1243. class PageParserChainsKeys(PageParserFunc):
  1244. @staticmethod
  1245. def special_keys(key: str, is_special_keys):
  1246. if is_special_keys:
  1247. return keys_name_dict.get(key.lower(), key), f"[{key.upper()}]"
  1248. else:
  1249. return key, key
  1250. def action_key_down(
  1251. self, chains, key, element_value, index, is_special_keys, **kwargs
  1252. ): # down
  1253. new_key, key = self.special_keys(key, is_special_keys)
  1254. @self.add_base
  1255. def action(*args, **kwargs):
  1256. nonlocal self
  1257. self.element_dict[chains][0].key_down(
  1258. new_key, self.element_dict[element_value][index]
  1259. )
  1260. self.add_func(
  1261. f"[{chains}]key_down>{key}:[{element_value}][{index}]", action
  1262. ) # 添加func
  1263. def action_key_up(
  1264. self, chains, key, element_value, index, is_special_keys, **kwargs
  1265. ): # down
  1266. new_key, key = self.special_keys(key, is_special_keys)
  1267. @self.add_base
  1268. def action(*args, **kwargs):
  1269. nonlocal self
  1270. self.element_dict[chains][0].key_up(
  1271. new_key, self.element_dict[element_value][index]
  1272. )
  1273. self.add_func(
  1274. f"[{chains}]key_up>{key}:[{element_value}][{index}]", action
  1275. ) # 添加func
  1276. def action_send_keys_to_element(
  1277. self, chains, key, element_value, index, is_special_keys, **kwargs
  1278. ):
  1279. new_key, key = self.special_keys(key, is_special_keys)
  1280. @self.add_base
  1281. def action(*args, **kwargs):
  1282. nonlocal self
  1283. self.element_dict[chains][0].send_keys_to_element(
  1284. self.element_dict[element_value][index], new_key
  1285. )
  1286. self.add_func(
  1287. f"[{chains}]sent>{key}:[{element_value}][{index}]", action
  1288. ) # 添加func
  1289. def action_send_keys(self, chains, key, is_special_keys, **kwargs): # 发送到焦点元素
  1290. new_key, key = self.special_keys(key, is_special_keys)
  1291. @self.add_base
  1292. def action(*args, **kwargs):
  1293. nonlocal self
  1294. self.element_dict[chains][0].send_keys(new_key)
  1295. self.add_func(f"[{chains}].sent>{key}", action) # 添加func
  1296. @plugin_class_loading(get_path(r'template/crawler'))
  1297. class PageParserChains(PageParserChainsWindow, PageParserClick, PageParserChainsMouse,
  1298. PageParserChainsKeys):
  1299. def make_action_chains(self, **kwargs): # 创建动作链
  1300. @self.add_base
  1301. def action(num, name, *args, **kwargs):
  1302. nonlocal self
  1303. self.element_dict[f"{name}[{num}]"] = [ActionChains(self.browser)]
  1304. self.add_func(f"make_ActionChains", action) # 添加func
  1305. def action_run(self, chains, run_time=1, **kwargs): # 执行
  1306. @self.add_base
  1307. def action(*args, **kwargs):
  1308. nonlocal self
  1309. self.element_dict[chains][0].perform()
  1310. sleep(run_time)
  1311. self.add_func(f"[{chains}].run<{run_time}s", action) # 添加func
  1312. for i in range(1, 13): # F1 - F12按键
  1313. keys_name_dict[f"f{i}"] = eval(f"Keys.F{i}", {'Keys': Keys})
  1314. data_base = DatabaseController()