template.py
import bs4
import hashlib
import os
import re as regular
import threading
import time
from abc import ABCMeta, abstractmethod
from time import sleep
import logging
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select  # used by the list-box (select/deselect) actions below
from selenium.common.exceptions import InvalidSessionIdException, WebDriverException
import requests
from system import plugin_class_loading, get_path, basicConfig

logging.basicConfig(**basicConfig)
keys_name_dict = {
    "ctrl": Keys.CONTROL,
    "shift": Keys.SHIFT,
    "tab": Keys.TAB,
    "left_ctrl": Keys.LEFT_CONTROL,
    "left_shift": Keys.LEFT_SHIFT,
    "left_alt": Keys.LEFT_ALT,
    "alt": Keys.ALT,
    "enter": Keys.ENTER,
    "return": Keys.RETURN,
    "backspace": Keys.BACKSPACE,
    "del": Keys.DELETE,
    "pgup": Keys.PAGE_UP,
    "pgdn": Keys.PAGE_DOWN,
    "home": Keys.HOME,
    "end": Keys.END,
    "esc": Keys.ESCAPE,
    "insert": Keys.INSERT,
    "meta": Keys.META,
    "up": Keys.UP,
    "down": Keys.DOWN,
    "right": Keys.RIGHT,
    "left": Keys.LEFT,
}  # key name -> Keys value mapping
class PageParserError(Exception):
    pass


class UrlError(Exception):
    pass


class CookiesError(Exception):
    pass


class Database(metaclass=ABCMeta):
    @abstractmethod
    def __str__(self):
        pass

    @abstractmethod
    def close(self):
        pass

    @abstractmethod
    def add_new(self, data):
        pass

    @abstractmethod
    def remove(self):
        pass

    @abstractmethod
    def out_file(self, out_dir):
        pass
class CoTanDB(Database):
    def __init__(self, name):
        self.dir = rf"{os.getcwd()}{os.sep}Database_dir{os.sep}{name}.cotanDB"  # path of the DB file
        self.file = open(self.dir, "r+" if os.path.exists(self.dir) else "w+")
        self.id = 0
        self.name = name
        for _ in self.file.readlines():  # next record id = number of existing lines
            self.id += 1

    def __str__(self):
        return self.name

    def close(self):
        try:
            self.file.close()
        except IOError:
            pass

    def add_new(self, data):
        data_str = str(self.id)
        for i in data:
            data_str += "," + str(i)
        data_str += "\n"
        self.file.write(data_str)
        self.file.flush()
        self.id += 1

    def remove(self):
        self.file.close()
        os.remove(self.dir)

    def out_file(self, out_dir):
        with open(out_dir + rf"{os.sep}{self.name}.cotanDB", "w") as f:
            with open(self.dir) as g:
                f.write(g.read())
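# A CoTanDB table is a plain text file holding one comma-joined record per line,
# prefixed with an auto-incremented id. Minimal usage sketch (illustrative names;
# the Database_dir folder is assumed to already exist in the working directory):
#
#     db = CoTanDB("links")            # opens ./Database_dir/links.cotanDB
#     db.add_new(["a.html", "ok"])     # appends the line "0,a.html,ok"
#     db.add_new(["b.html", "404"])    # appends the line "1,b.html,404"
#     db.close()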
class DatabaseControllerBase:
    def __init__(self):
        self.database = {}


class AddDatabase(DatabaseControllerBase):
    def add_database(self, name):  # create a new data table
        self.database[name] = CoTanDB(name)


class DatabaseControllerCustom(metaclass=ABCMeta):
    @abstractmethod
    def close(self, name):
        pass

    @abstractmethod
    def close_all(self):
        pass

    @abstractmethod
    def rm_database(self, name):
        pass

    @abstractmethod
    def out(self, name, save_dir):
        pass

    @abstractmethod
    def return_database(self):
        pass


class DatabaseController(AddDatabase, DatabaseControllerCustom):  # database controller
    def add_new(self, name, data):  # append a new record, creating the table if needed
        database = self.database.get(name)
        if database is None:
            self.add_database(name)
            database = self.database.get(name)
        database.add_new(data)

    def close(self, name):  # close one data table
        try:
            self.database[name].close()
            del self.database[name]
        except KeyError:
            pass

    def close_all(self):  # close all data tables
        for i in self.database:
            self.database[i].close()
        self.database = {}

    def rm_database(self, name):  # delete a data table
        self.database[name].remove()
        del self.database[name]

    def out(self, name, save_dir):  # export a data table
        self.database[name].out_file(save_dir)

    def return_database(self):
        return list(self.database.keys())
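# Usage sketch for the controller (illustrative names): tables are created lazily on the
# first add_new() call and can be exported as a copy with out().
#
#     controller = DatabaseController()
#     controller.add_new("pages", ["title", "url"])  # creates the "pages" table on first use
#     controller.out("pages", os.getcwd())           # write a copy of the table to the cwd
#     controller.close_all()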
class LogBase(metaclass=ABCMeta):
    @abstractmethod
    def write(self, data):
        pass

    @abstractmethod
    def close(self):
        pass


class Log(LogBase):
    def __init__(self, log_dir):
        self.log_dir = log_dir
        log_file_dir = log_dir + f"{os.sep}log.coTanLog"
        self.log_file = open(
            log_file_dir,
            "r+" if os.path.exists(log_file_dir) else "w+",
        )

    def write(self, data):
        self.log_file.write(
            f"[{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))}] "
            + data
            + "\n"
        )
        self.log_file.flush()

    def close(self):
        self.log_file.close()
class PageBase(metaclass=ABCMeta):
    def __init__(self, time_out):
        self.url = ""
        self.user_agent = ""
        self.mode = "PAGE"
        self.time_out = time_out

    def __str__(self):
        return f"[{self.time_out}s]{self.mode}-{self.url}:UA>{self.user_agent}"

    @abstractmethod
    def init(self, *args, **kwargs):
        pass
class __RequestsBase(PageBase):
    def init(self, user_agent, url, cookies):
        if user_agent == "":
            user_agent = (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36 Edg/80.0.361.66"
            )
        self.user_agent = user_agent
        self.headers = {
            "Accept": "text/html, application/xhtml+xml, image/jxr, */*",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-Hans-CN, zh-Hans; q=0.5",
            "Connection": "Keep-Alive",
            "User-Agent": user_agent,
        }
        self.url = url
        self.cookies = cookies
        self.new = True
@plugin_class_loading(get_path(r'template/crawler'))
class UrlGet(__RequestsBase):  # GET request through requests
    def __init__(self, url, time_out, user_agent="", cookies=None, **kwargs):
        super(UrlGet, self).__init__(time_out)
        self.mode = "simplify_get"
        self.requests = requests.get
        self.init(user_agent, url, cookies)


@plugin_class_loading(get_path(r'template/crawler'))
class UrlPost(__RequestsBase):  # POST request through requests
    def __init__(self, url, data, time_out, user_agent="", cookies=None, **kwargs):
        super(UrlPost, self).__init__(time_out)
        self.mode = "post"
        self.data = data
        self.requests = requests.post
        self.init(user_agent, url, cookies)

    def __str__(self):
        return super(UrlPost, self).__str__() + f";data>{self.data}"
@plugin_class_loading(get_path(r'template/crawler'))
class UrlPage(PageBase):
    def __init__(
        self,
        url,
        time_out,
        first_run=False,
        head=False,
        no_plugins=True,
        no_js=False,
        no_java=False,
        no_img=False,
        user_agent="",
        cookies=None,
        new=False,
        down_load_dir="",
        **kwargs,
    ):
        super(UrlPage, self).__init__(time_out)
        self.url = url
        self.mode = "get"
        self.options = webdriver.ChromeOptions()
        self.cookies = cookies  # key of the stored cookies to load
        self.new = new  # open a fresh browser instance instead of reusing the current page
        self.down_load_dir = down_load_dir
        self.init(first_run, head, no_plugins, no_js, no_java, no_img, user_agent)

    def init(self, first_run, head, no_plugins, no_js, no_java, no_img, user_agent):
        self.options.add_argument("disable-infobars")  # hide the automation infobar
        prefs = {
            "profile.default_content_settings.popups": 0,
            "download.default_directory": self.down_load_dir,
        }
        self.options.add_experimental_option("prefs", prefs)  # download settings
        if first_run:
            self.options.add_argument("-first run")
        if head:  # headless mode
            self.options.add_argument("--headless")
            self.options.add_argument("--disable-gpu")
        if no_plugins:
            self.options.add_argument("--disable-plugins")
        if no_js:
            self.options.add_argument("--disable-javascript")
        if no_java:
            self.options.add_argument("--disable-java")
        if no_img:
            self.options.add_argument("blink-settings=imagesEnabled=false")
        if user_agent == "":
            user_agent = (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/80.0.3987.132 Safari/537.36"
            )
        # self.options.add_argument(f'--user-agent="{user_agent}"')
        self.user_agent = user_agent

    def __str__(self):
        return f"{self.mode}-{self.url}:UA>{self.user_agent}"
class Urlbase(metaclass=ABCMeta):
    url_count = 0  # number of url managers created

    def __init__(self, dic="", dic_run=""):
        Urlbase.url_count += 1
        self.save_dir = dic
        dic += f"{os.sep}url[{Urlbase.url_count}].cot_url"
        dic_run += f"{os.sep}url_run[{Urlbase.url_count}].cot_url"
        self.dir = dic
        self.dir_run = dic_run
        self.file = open(dic, "a")  # url history file
        self.file_run = open(dic_run, "a")  # file of urls already processed
        self.url_list = []  # urls waiting to be fetched
        self.url_history = []  # url history
        self.filter = {}  # filter functions

    @abstractmethod
    def filter_func(self, url, **kwargs):
        pass

    @abstractmethod
    def add_filter_func(self, func, name):
        pass

    @abstractmethod
    def del_filter_func(self, index):
        pass

    @abstractmethod
    def return_filter_func(self):
        pass

    @abstractmethod
    def close(self):
        pass

    @abstractmethod
    def out_url_history(self, url):
        pass

    @abstractmethod
    def out_url_run(self, url):
        pass
@plugin_class_loading(get_path(r'template/crawler'))
class UrlFile(Urlbase, metaclass=ABCMeta):
    def close(self):
        self.file.close()
        self.file_run.close()

    def out_url_history(self, url):  # record url history
        self.file.write(f"{url}\n")
        self.file.flush()

    def out_url_run(self, url):  # record a url that has been processed
        self.file_run.write(f"{url}\n")
        self.file_run.flush()


@plugin_class_loading(get_path(r'template/crawler'))
class UrlAdd(Urlbase, metaclass=ABCMeta):
    def filter_func(self, url, **kwargs):  # url filtering
        for i in self.filter:
            if not self.filter[i](url):
                return False
        return True

    def add_filter_func(self, func, name):  # register a filter function
        self.filter[name] = func

    def del_filter_func(self, index):  # remove a filter function
        del self.filter[list(self.filter.keys())[index]]

    def return_filter_func(self):
        return list(self.filter.keys())
@plugin_class_loading(get_path(r'template/crawler'))
class UrlReturn(Urlbase, metaclass=ABCMeta):
    def del_url(self, index):  # delete a queued url
        self.out_url_run(f"DELETE {self.url_list[index]}")
        del self.url_list[index]

    def get_url(self):  # pop the next url
        url_page = self.url_list[0]
        self.out_url_run(url_page.url)
        del self.url_list[0]
        return url_page

    def is_finish(self):
        return len(self.url_list) == 0

    def add_url(self, url, func, data=None, **kwargs):  # add a url
        if func == "":
            func = "get"
        if func == "get":
            url_ = url
        else:
            url_ = url + str(data)
        if url_ not in self.url_history and self.filter_func(url, func=func):  # 1. not in history, 2. passes the filters
            if func == "get":
                self.url_list.append(
                    UrlPage(url=url, **kwargs, down_load_dir=self.save_dir)
                )  # queue for fetching
            elif func == "simplify_get":
                self.url_list.append(
                    UrlGet(url=url, **kwargs, down_load_dir=self.save_dir)
                )  # queue for fetching
            else:
                self.url_list.append(UrlPost(url=url, data=data, **kwargs))  # queue for fetching
            self.url_history.append(url_)  # record in history
            self.out_url_history(url_)  # write history to file
            return True  # added
        return False  # rejected
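# The mixins above are meant to be combined into one concrete url manager; a minimal
# sketch (the class name "Url" and the lambda filter are assumptions for illustration):
#
#     class Url(UrlFile, UrlAdd, UrlReturn):
#         pass
#
#     url_manager = Url(os.getcwd(), os.getcwd())
#     url_manager.add_filter_func(lambda u: u.startswith("http"), "http_only")
#     url_manager.add_url("https://example.com", "simplify_get", time_out=10)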
class SeleniumBase(metaclass=ABCMeta):
    @abstractmethod
    def selenium_mode(self, func_cookie, url):
        pass


class RequestsBase(metaclass=ABCMeta):
    @abstractmethod
    def requests_mode(self, func_cookie, url):
        pass


class PagedownloaderBase(SeleniumBase, RequestsBase, metaclass=ABCMeta):
    downloader_count = 0

    def __init__(self, url, dic=""):
        self.url = url
        self.dir = dic
        self.log = Log(dic)
        PagedownloaderBase.downloader_count += 1
        self.page_source_dict = {}  # saved page information
        self.cookie_Thread = None  # cookie-monitoring thread
        self.browser = None
        self.cookie_dict = {}
        self.cookie_dict_list = {}  # selenium cookies
        self.last_mode = None

    def set_page_parser(self, parser):
        self.parser = parser

    @abstractmethod
    def monitoring_add_cookies(self, cookies):
        pass

    @abstractmethod
    def monitoring_clear_cookier(self):
        pass
@plugin_class_loading(get_path(r'template/crawler'))
class PageDownloaderRun(PagedownloaderBase, metaclass=ABCMeta):
    def close(self):
        self.log.close()

    def stop(self):
        self.break_ = False
        if self.last_mode is not None:
            try:
                self.browser.quit()
            except InvalidSessionIdException:
                pass
            self.last_mode = None

    def start_to_run(self, *args, func_cookie):  # fetch one url and produce a page
        self.break_ = False
        self.page_source_dict = {}
        self.url_text = self.url.get_url()  # take the next url
        url = self.url_text.url
        try:
            if self.url_text.mode == "get":
                self.selenium_mode(func_cookie, url)
            else:  # requests mode
                self.requests_mode(func_cookie, url)
        except BaseException:
            raise CookiesError
        finally:
            self.last_mode = self.url_text.mode
            self.parser.browser = self.browser
            self.parser.init(url)
        return self.browser
@plugin_class_loading(get_path(r'template/crawler'))
class PageDownloaderCookies(PagedownloaderBase, metaclass=ABCMeta):
    def monitoring_del_cookies(self, name):  # delete a named cookie
        self.browser.delete_cookie(name)

    def monitoring_clear_cookier(self):  # clear all cookies
        self.browser.delete_all_cookies()

    def monitoring_add_cookies(self, cookies: dict):  # add a cookie
        self.browser.add_cookie(cookies)

    def monitoring_update_cookies(self, name, cookies: dict):
        cookies_list = self.browser.get_cookies()
        for i in cookies_list:
            if i.get("name", None) == name:
                self.browser.delete_cookie(name)  # drop the old cookie
                i.update(cookies)
                self.browser.add_cookie(i)
                return
        raise Exception
@plugin_class_loading(get_path(r'template/crawler'))
class PageDownloaderRequests(PageDownloaderRun, metaclass=ABCMeta):
    def requests_start_cookies(self, func_cookie, url):
        self.cookie_dict[url] = requests.utils.dict_from_cookiejar(
            self.browser.cookies
        )  # save the response cookies
        func_cookie([self.cookie_dict[url]])

    def requests_run(self, parameters, url):
        self.browser = self.url_text.requests(
            url,
            headers=self.url_text.headers,
            timeout=self.url_text.time_out,
            **parameters,
        )

    def requests_data(self, parameters):
        if self.url_text.mode == "post":
            parameters["data"] = self.url_text.data
        return parameters

    def requests_cookies(self, func_cookie):
        try:
            parameters = {"cookies": self.cookie_dict[self.url_text.cookies]}
        except KeyError:
            parameters = {}
            func_cookie([])
        else:
            func_cookie([parameters["cookies"]])
        return parameters

    def requests_mode(self, func_cookie, url):
        parameters = self.requests_cookies(func_cookie)
        parameters = self.requests_data(parameters)
        self.requests_run(parameters, url)
        self.requests_start_cookies(func_cookie, url)
@plugin_class_loading(get_path(r'template/crawler'))
class PageDownloaderSelenium(PageDownloaderRun, metaclass=ABCMeta):
    def selenium_quit(self):
        try:
            self.browser.quit()
        except InvalidSessionIdException:
            pass

    def selenium_cookies(self):
        try:
            if not self.url_text.new:
                raise UrlError
            cookies_list = self.cookie_dict_list[self.url_text.cookies]
        except (UrlError, KeyError):
            pass
        else:
            self.monitoring_clear_cookier()
            try:
                for i in cookies_list:
                    self.monitoring_add_cookies(i)
            except WebDriverException:
                pass

    def start_selenium(self, quit_=True):
        if quit_:
            self.selenium_quit()
        self.browser = webdriver.Chrome(options=self.url_text.options)

    def selenium_run(self, url):
        self.browser.set_page_load_timeout(self.url_text.time_out)  # page-load timeout
        self.browser.set_script_timeout(self.url_text.time_out)  # async-script timeout
        self.browser.get(url)

    def selenium_start_cookies(self, func_cookie, url):
        self.break_ = True

        def update_cookie():
            nonlocal self
            while self.break_:
                try:
                    cookies = self.browser.get_cookies()
                    func_cookie(cookies)  # report the cookies to the GUI
                    self.cookie_dict[url] = cookies
                    time.sleep(0.5)
                except WebDriverException:
                    pass

        self.cookie_Thread = threading.Thread(target=update_cookie)
        self.cookie_Thread.start()

    def selenium_mode(self, func_cookie, url):
        if self.url_text.new and self.last_mode == "get":  # restart the browser
            self.start_selenium()
        elif self.last_mode is None:
            self.start_selenium(False)
        try:
            self.selenium_run(url)
        except WebDriverException:
            self.start_selenium()
            self.selenium_run(url)
        self.selenium_cookies()
        self.selenium_start_cookies(func_cookie, url)
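# A concrete downloader combines the selenium and requests mix-ins with the cookie
# helpers; a minimal sketch (the class name "PageDownloader" is an assumption). A page
# parser must be attached through set_page_parser() before start_to_run() is called,
# because the downloader hands the fetched page over to it.
#
#     class PageDownloader(PageDownloaderSelenium, PageDownloaderRequests, PageDownloaderCookies):
#         pass
#
#     downloader = PageDownloader(url_manager, os.getcwd())  # url_manager from the sketch above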
class PageParserBase:
    def __init__(self, downloader):
        self.downloader = downloader
        self.downloader.set_page_parser(self)
        self.func_list = []
        self.func_dict = {}
        self.n = 0
        self.init()

    def init(self, url=""):
        self.element_dict = {}  # named store of located elements
        self.url_text = url

    @staticmethod
    def add_base(func):  # decorator: run a queued step and report (ok, error message)
        def wrap(num=None, name=None, *args, **kwargs):
            try:
                func(num=num, name=name, *args, **kwargs)
                return True, ''
            except BaseException as e:
                return False, str(e)
        return wrap
@plugin_class_loading(get_path(r'template/crawler'))
class PageParserFunc(PageParserBase):
    def tra_func(self):
        self.func_list = []
        self.func_dict = {}
        self.n = 0

    def add_func(self, name, func):
        self.func_list.append(f"{name}[{self.n}]")
        self.func_dict[f"{name}[{self.n}]"] = func
        self.n += 1

    def del_func(self, index, end=False):
        if end:
            index = len(self.func_list) - index - 1
        del self.func_dict[self.func_list[index]]
        self.func_list[index] = "Func_have_been_del"
        self.func_dict["Func_have_been_del"] = lambda *args, **kwargs: None

    def return_func(self, only=True):
        if only:
            return self.func_list.copy()
        else:
            return [
                f"var[{index}]@ {i}" for index, i in enumerate(self.func_list.copy())
            ]
@plugin_class_loading(get_path(r'template/crawler'))
class PageParserFind(PageParserFunc):
    def find_id(self, id_, not_all=False, **kwargs):
        @self.add_base
        def find(num, name, *args, **kwargs):
            nonlocal self, id_
            if not_all:
                self.element_dict[f"{name}[{num}]"] = [
                    self.browser.find_element_by_id(id_)
                ]  # the stored value must be a list
            else:
                self.element_dict[f"{name}[{num}]"] = self.browser.find_elements_by_id(id_)
        self.add_func(f"find_ID:{id_}", find)  # register the func

    def find_class(self, class_name, not_all=False, **kwargs):
        @self.add_base
        def find(num, name, *args, **kwargs):
            nonlocal self, class_name
            if not_all:
                self.element_dict[f"{name}[{num}]"] = [
                    self.browser.find_element_by_class_name(class_name)
                ]  # the stored value must be a list
            else:
                self.element_dict[
                    f"{name}[{num}]"
                ] = self.browser.find_elements_by_class_name(
                    class_name
                )  # the stored value must be a list
        self.add_func(f"find_class:{class_name}", find)  # register the func

    def find_name(self, name_, not_all=False, **kwargs):
        @self.add_base
        def find(num, name, *args, **kwargs):
            nonlocal self, name_
            if not_all:
                self.element_dict[f"{name}[{num}]"] = [
                    self.browser.find_element_by_name(name_)
                ]  # the stored value must be a list
            else:
                self.element_dict[f"{name}[{num}]"] = self.browser.find_elements_by_name(
                    name_
                )  # the stored value must be a list
        self.add_func(f"find_name:{name_}", find)  # register the func

    def find_xpath(self, xpath, not_all=False, **kwargs):
        @self.add_base
        def find(num, name, *args, **kwargs):
            nonlocal self, xpath
            if not_all:
                self.element_dict[f"{name}[{num}]"] = [
                    self.browser.find_element_by_xpath(xpath)
                ]  # the stored value must be a list
            else:
                self.element_dict[f"{name}[{num}]"] = self.browser.find_elements_by_xpath(
                    xpath
                )  # the stored value must be a list
        self.add_func(f"find_xpath:{xpath}", find)  # register the func

    def find_css(self, css_selector, not_all=False, **kwargs):
        @self.add_base
        def find(num, name, *args, **kwargs):
            nonlocal self, css_selector
            if not_all:
                self.element_dict[f"{name}[{num}]"] = [
                    self.browser.find_element_by_css_selector(css_selector)
                ]  # the stored value must be a list
            else:
                self.element_dict[
                    f"{name}[{num}]"
                ] = self.browser.find_elements_by_css_selector(
                    css_selector
                )  # the stored value must be a list
        self.add_func(f"find_css:{css_selector}", find)  # register the func

    def find_tag_name(self, tag_name, not_all=False, **kwargs):
        @self.add_base
        def find(num, name, *args, **kwargs):
            nonlocal self, tag_name
            if not_all:
                self.element_dict[f"{name}[{num}]"] = [
                    self.browser.find_element_by_tag_name(tag_name)
                ]  # the stored value must be a list
            else:
                self.element_dict[f"{name}[{num}]"] = self.browser.find_elements_by_tag_name(
                    tag_name
                )  # the stored value must be a list
        self.add_func(f"find_tagName:{tag_name}", find)  # register the func
    def find_link_text(self, link_text, not_all=False, **kwargs):  # match by link text
        @self.add_base
        def find(num, name, *args, **kwargs):
            nonlocal self, link_text
            if not_all:
                self.element_dict[f"{name}[{num}]"] = [
                    self.browser.find_element_by_link_text(link_text)
                ]  # the stored value must be a list
            else:
                self.element_dict[
                    f"{name}[{num}]"
                ] = self.browser.find_elements_by_link_text(
                    link_text
                )  # the stored value must be a list
        self.add_func(f"find_link_text:{link_text}", find)  # register the func

    def find_partial_link_text(
        self, partial_link_text, not_all=False, **kwargs
    ):  # fuzzy match by link text
        @self.add_base
        def find(num, name, *args, **kwargs):
            nonlocal self, partial_link_text
            if not_all:
                self.element_dict[f"{name}[{num}]"] = [
                    self.browser.find_element_by_partial_link_text(partial_link_text)
                ]  # the stored value must be a list
            else:
                self.element_dict[
                    f"{name}[{num}]"
                ] = self.browser.find_elements_by_partial_link_text(
                    partial_link_text
                )  # the stored value must be a list
        self.add_func(f"find_partial_link_text:{partial_link_text}", find)  # register the func

    def find_switch_to_alert(self, *args, **kwargs):  # locate the alert box
        @self.add_base
        def find(num, name, *args, **kwargs):
            nonlocal self
            self.element_dict[f"{name}[{num}]"] = [self.browser.switch_to.alert]
        self.add_func(f"find_alert", find)  # register the func

    def find_switch_to_active_element(self, *args, **kwargs):  # locate the focused element
        @self.add_base
        def find(num, name, *args, **kwargs):
            nonlocal self
            self.element_dict[f"{name}[{num}]"] = [self.browser.switch_to.active_element]
        self.add_func(f"active_element", find)  # register the func

    def find_switch_to_frame(self, reference, is_id=False, *args, **kwargs):  # switch frames
        @self.add_base
        def find(num, name, *args, **kwargs):
            nonlocal self, reference, is_id
            if reference is None:
                self.element_dict[f"{name}[{num}]"] = [
                    self.browser.switch_to.default_content()
                ]  # back to the main document
            elif reference == "":
                self.element_dict[f"{name}[{num}]"] = [
                    self.browser.switch_to.parent_frame()
                ]  # back to the parent document
            else:
                if is_id:
                    reference = int(reference)  # switch by frame index
                self.element_dict[f"{name}[{num}]"] = [
                    self.browser.switch_to.frame(reference)
                ]  # enter the frame
        func_name = {None: "main document", "": "parent document"}.get(reference, reference)
        self.add_func(f"find_frame:{func_name}", find)  # register the func
@plugin_class_loading(get_path(r'template/crawler'))
class PageParserActionListBox(PageParserFunc):  # <select> list-box actions; stored WebElements are wrapped in selenium's Select helper
    def deselect_by_index(
        self, element_value, deselect, index=0, **kwargs
    ):  # deselect an option by index
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            Select(self.element_dict[element_value][index]).deselect_by_index(int(deselect))
        self.add_func(
            f"deselect_by_index:{deselect}>{element_value}[{index}]", action
        )  # register the func

    def deselect_by_text(
        self, element_value, deselect, index=0, **kwargs
    ):  # deselect an option by visible text
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            Select(self.element_dict[element_value][index]).deselect_by_visible_text(deselect)
        self.add_func(
            f"deselect_by_text:{deselect}>{element_value}[{index}]", action
        )  # register the func

    def deselect_by_value(
        self, element_value, deselect, index=0, **kwargs
    ):  # deselect an option by value
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            Select(self.element_dict[element_value][index]).deselect_by_value(deselect)
        self.add_func(
            f"deselect_by_value:{deselect}>{element_value}[{index}]", action
        )  # register the func

    def select_by_index(self, element_value, deselect, index=0, **kwargs):  # select an option by index
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            Select(self.element_dict[element_value][index]).select_by_index(int(deselect))
        self.add_func(
            f"select_by_index:{deselect}>{element_value}[{index}]", action
        )  # register the func

    def select_by_text(self, element_value, deselect, index=0, **kwargs):  # select an option by visible text
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            Select(self.element_dict[element_value][index]).select_by_visible_text(deselect)
        self.add_func(
            f"select_by_text:{deselect}>{element_value}[{index}]", action
        )  # register the func

    def select_by_value(self, element_value, deselect, index=0, **kwargs):  # select an option by value
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            Select(self.element_dict[element_value][index]).select_by_value(deselect)
        self.add_func(
            f"select_by_value:{deselect}>{element_value}[{index}]", action
        )  # register the func
@plugin_class_loading(get_path(r'template/crawler'))
class PageParserAction(PageParserFunc):
    def send_keys(self, text, element_value, index=0, **kwargs):  # type text
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.element_dict[element_value][index].send_keys(text)
        self.add_func(f"sent_text:{text}>{element_value}[{index}]", action)  # register the func

    def authentication(
        self, user, passwd, element_value, index=0, **kwargs
    ):  # submit credentials (user & password)
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.element_dict[element_value][index].authenticate(user, passwd)
        self.add_func(
            f"Authentication:{user};{passwd}>{element_value}[{index}]", action
        )  # register the func

    def clear(self, element_value, index=0, **kwargs):  # clear the text field
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.element_dict[element_value][index].clear()
        self.add_func(f"clear_text>{element_value}[{index}]", action)  # register the func

    def click(self, element_value, index=0, **kwargs):  # click a button
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.element_dict[element_value][index].click()
        self.add_func(f"click>{element_value}[{index}]", action)  # register the func

    def accept(self, element_value, index=0, **kwargs):  # confirm an alert box
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.element_dict[element_value][index].accept()
        self.add_func(f"accept>{element_value}[{index}]", action)  # register the func

    def dismiss(self, element_value, index=0, **kwargs):  # cancel an alert box
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.element_dict[element_value][index].dismiss()
        self.add_func(f"dismiss>{element_value}[{index}]", action)  # register the func

    def submit(self, element_value, index=0, **kwargs):  # submit a form
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.element_dict[element_value][index].submit()
        self.add_func(f"submit>{element_value}[{index}]", action)  # register the func

    def run_js(self, js, **kwargs):
        @self.add_base
        def action(num, name, *args, **kwargs):
            nonlocal self
            get = self.browser.execute_script(js)
            if hasattr(get, "__getitem__"):  # sliceable result
                self.element_dict[f"{name}[{num}]"] = get  # the stored value must be a list
            else:
                self.element_dict[f"{name}[{num}]"] = [get]
        self.add_func(f"run_js:{js}", action)


class PageParserAutomation(PageParserFind, PageParserActionListBox, PageParserAction):
    pass
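# Parser steps are queued rather than executed immediately: each find_*/action method
# wraps its body with add_base (so the queued callable returns (True, "") or
# (False, error_message)) and registers it through add_func. A minimal sketch of
# driving the queue by hand, assuming the downloader has already attached a live
# browser to the parser (labels below are illustrative):
#
#     parser = PageParserAutomation(downloader)       # downloader from the sketch above
#     parser.find_id("search-box")                    # queued under the label "find_ID:search-box[0]"
#     label = parser.return_func()[0]
#     ok, err = parser.func_dict[label](num=0, name=label)
#     elements = parser.element_dict[f"{label}[0]"]   # located WebElements, always a list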
@plugin_class_loading(get_path(r'template/crawler'))
class PageParserCookies(PageParserFunc):
    def del_all_cookies(self, **kwargs):  # delete all cookies
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.browser.delete_all_cookies()
        self.add_func(f"del_all_cookies", action)

    def del_cookies(self, cookies_name, **kwargs):  # delete a named cookie
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.browser.delete_cookie(cookies_name)
        self.add_func(f"del_cookies:{cookies_name}", action)

    def add_cookies(self, cookies, **kwargs):  # add a cookie
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.browser.add_cookie(cookies)
        self.add_func(f"add_cookies:{cookies}", action)

    def update_cookies(self, cookies_name, cookies, **kwargs):  # update a cookie
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            now_cookies = self.browser.get_cookie(cookies_name)
            self.browser.delete_cookie(cookies_name)
            now_cookies.update(cookies)
            self.browser.add_cookie(now_cookies)
        self.add_func(f"update_cookies:{cookies}", action)

    def get_cookies(self, cookies_name, **kwargs):  # get a named cookie
        @self.add_base
        def action(num, name, *args, **kwargs):
            nonlocal self
            self.element_dict[f"{name}[{num}]"] = [
                self.browser.get_cookie(cookies_name)
            ]
        self.add_func(f"get_cookies:{cookies_name}", action)

    def get_all_cookies(self, **kwargs):  # get all cookies
        @self.add_base
        def action(num, name, *args, **kwargs):
            nonlocal self
            self.element_dict[f"{name}[{num}]"] = self.browser.get_cookies()
        self.add_func(f"get_all_cookies", action)
@plugin_class_loading(get_path(r'template/crawler'))
class PageParserBrowserActions(PageParserFunc):
    def back(self, **kwargs):  # go back
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.browser.back()
        self.add_func(f"BACK", action)

    def forward(self, **kwargs):  # go forward
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.browser.forward()
        self.add_func(f"FORWARD", action)

    def refresh(self, **kwargs):  # refresh the page
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.browser.refresh()
        self.add_func(f"REFRESH", action)

    def wait_sleep(self, time: int = 2, **kwargs):  # hard wait
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            sleep(time)
        self.add_func(f"WAIT:{time}s", action)

    def set_wait(self, time: int = 2, **kwargs):  # implicit wait
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.browser.implicitly_wait(time)
        self.add_func(f"Loading_wait:{time}s", action)


class PageParserBrowser(PageParserBrowserActions, PageParserCookies):
    pass
@plugin_class_loading(get_path(r'template/crawler'))
class PageParserNeighbor(PageParserFunc):
    def __get_other_base(
        self, element_value, index: (slice, int), who="children", **kwargs
    ):  # shared base for child/descendant/sibling lookups
        @self.add_base
        def action(num, name, *args, **kwargs):
            nonlocal self
            iter_list = self.list_slicing(index, element_value)
            paser_list = []
            for bs in iter_list:
                if who != "brothers":
                    paser_list += {
                        "children": bs.children,
                        "offspring": bs.descendants,
                        "down": bs.next_siblings,
                        "up": bs.previous_siblings,
                    }.get(who, bs.children)
                else:
                    paser_list += bs.previous_siblings
                    paser_list += bs.next_siblings
            self.element_dict[f"{name}[{num}]"] = list(set(paser_list))
        self.add_func(f"get_{who}:{element_value}[{index}]", action)  # register the func

    def get_children(self, element_value, index: (slice, int), **kwargs):
        return self.__get_other_base(element_value, index)

    def get_offspring(self, element_value, index: (slice, int), **kwargs):
        return self.__get_other_base(element_value, index, "offspring")

    def get_up(self, element_value, index: (slice, int), **kwargs):
        return self.__get_other_base(element_value, index, "up")

    def get_down(self, element_value, index: (slice, int), **kwargs):
        return self.__get_other_base(element_value, index, "down")

    def get_brothers(self, element_value, index: (slice, int), **kwargs):
        return self.__get_other_base(element_value, index, "brothers")
@plugin_class_loading(get_path(r'template/crawler'))
class PageParserDataFindall(PageParserFunc):
    def findall(
        self,
        element_value,
        tag: (str, list),
        attribute: dict,
        limit,
        recursive,
        index: (slice, int),
        **kwargs,
    ):  # locate by tag name and attributes
        if isinstance(tag, str):
            tag = str(tag).split(",")
        try:
            limit = int(limit)
        except ValueError:
            limit = None

        @self.add_base
        def action(num, name, *args, **kwargs):
            nonlocal self
            iter_list = self.list_slicing(index, element_value)
            paser_list = []
            for bs in iter_list:
                try:
                    re = bs.find_all(tag, attribute, limit=limit, recursive=recursive)
                except AttributeError:
                    try:
                        if str(bs.name) not in tag:
                            raise PageParserError
                        for attr_name in attribute:
                            text = attribute[attr_name]
                            if isinstance(text, str):
                                if bs.attrs[attr_name] != text:
                                    raise PageParserError
                            else:  # regular-expression match
                                if not regular.match(text, bs.attrs[attr_name]):
                                    raise PageParserError
                        re = [bs]
                    except PageParserError:
                        re = []
                paser_list += re
            self.element_dict[f"{name}[{num}]"] = paser_list
        self.add_func(f"findAll:{element_value}[{index}]", action)  # register the func

    def findall_by_text(
        self,
        element_value,
        text: (regular.compile, str),
        limit,
        recursive,
        index: (slice, int),
        **kwargs,
    ):  # locate by text
        try:
            limit = int(limit)
        except ValueError:
            limit = None

        @self.add_base
        def action(num, name, *args, **kwargs):
            nonlocal self
            iter_list = self.list_slicing(index, element_value)
            paser_list = []
            for bs in iter_list:
                try:
                    re = bs.find_all(text=text, limit=limit, recursive=recursive)
                except AttributeError:
                    try:
                        if isinstance(text, str):
                            if str(bs.string) != text:
                                raise PageParserError
                        else:
                            if not regular.match(text, str(bs.string)):
                                raise PageParserError
                        re = [bs]
                    except PageParserError:
                        re = []
                paser_list += re
            self.element_dict[f"{name}[{num}]"] = paser_list
        self.add_func(f"findAll_by_text:{element_value}[{index}]", action)  # register the func
@plugin_class_loading(get_path(r'template/crawler'))
class PageParserDatabase(PageParserFunc):
    def to_database(
        self, element_value, index, data: (str, list), database_name: str, **kwargs
    ):  # write fields into the database
        @self.add_base
        def action(*args, **kwargs):
            global data_base
            nonlocal self
            iter_list = self.list_slicing(index, element_value)
            for bs in iter_list:
                new = []
                for i in data:
                    if i == "$name&":
                        new.append(bs.name)
                    elif i == "$self&":
                        new.append(str(bs).replace("\n", ""))
                    elif i == "$string$":
                        new.append(str(bs.string).replace("\n", ""))
                    else:
                        new.append(bs.attrs.get(i, ""))
                data_base.add_new(database_name, new)
        self.add_func(
            f"DataBase:{data}<{element_value}[{index}]>{database_name}", action
        )  # register the func

    def to_database_by_re(
        self, element_value, index, data: str, database_name: str, **kwargs
    ):  # write regex matches into the database
        data = regular.compile(data)

        @self.add_base
        def action(*args, **kwargs):
            global data_base
            nonlocal self
            iter_list = self.list_slicing(index, element_value)
            for bs in iter_list:
                new = regular.findall(data, str(bs))
                data_base.add_new(database_name, new)
        self.add_func(
            f"DataBase:{data}<{element_value}[{index}]>{database_name}", action
        )  # register the func
@plugin_class_loading(get_path(r'template/crawler'))
class PageParserDataSource(PageParserFunc):
    def to_text(self, **kwargs):  # get the page source
        @self.add_base
        def action(num, name, *args, **kwargs):
            nonlocal self
            try:
                self.element_dict[f"{name}[{num}]"] = [
                    self.browser.page_source,
                    self.url_text,
                ]
            except AttributeError:
                self.element_dict[f"{name}[{num}]"] = [
                    self.browser.text,
                    self.url_text,
                ]  # requests response
        self.add_func(f"get_page_source", action)

    def out_html(self, element_value, **kwargs):  # dump the page source to a file
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            md5 = hashlib.md5()  # name the file with an MD5 digest
            md5.update(f"{time.time()}_{self.url_text}".encode("utf-8"))
            name = md5.hexdigest()
            save_dir = self.dir + f"{os.sep}" + name + ".cotan_source"
            print(save_dir)
            with open(save_dir, "w") as f:
                f.write(self.element_dict[element_value][0])
            with open(save_dir + ".CoTanURL", "w") as f:
                f.write(self.element_dict[element_value][1])
        self.add_func(f"write_html<{element_value}", action)

    def make_bs(self, element_value, **kwargs):  # parse into a bs4 object
        @self.add_base
        def action(num, name, *args, **kwargs):
            nonlocal self
            self.element_dict[f"{name}[{num}]"] = [
                bs4.BeautifulSoup(self.element_dict[element_value][0], "html.parser")
            ]
        self.add_func(f"Parsing:{element_value}", action)  # register the func

    def add_url(
        self,
        element_value,
        index: (slice, int),
        url_name,
        update_func,
        url_args: dict,
        **kwargs,
    ):  # queue urls automatically
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            iter_list = self.list_slicing(index, element_value)
            for bs in iter_list:
                try:
                    if url_name == "$name&":
                        new_url = bs.name
                    elif url_name == "$self&":
                        new_url = str(bs).replace("\n", "")
                    elif url_name == "$string$":
                        new_url = str(bs.string).replace("\n", "")
                    else:
                        new_url = bs.attrs.get(url_name, "")
                    self.downloader.url.add_url(new_url, **url_args)
                except AttributeError:
                    pass
            update_func()  # refresh the tkinter GUI
        self.add_func(f"add_URL<{element_value}[{index}]:{url_name}", action)  # register the func

    def to_json(self, **kwargs):
        @self.add_base
        def action(num, name, *args, **kwargs):
            nonlocal self
            self.element_dict[f"{name}[{num}]"] = [
                self.browser.json()
            ]  # parse the requests response as json
        self.add_func(f"to_json", action)  # register the func
@plugin_class_loading(get_path(r'template/crawler'))
class PageParserTool(PageParserFunc):
    def list_slicing(self, index: (slice, int), element_value):
        if isinstance(index, int):
            return [self.element_dict[element_value][index]]
        else:
            return self.element_dict[element_value][index]

    def get_by_path(
        self, element_value, index: (slice, int), path, **kwargs
    ):  # select through a bs4 attribute path
        @self.add_base
        def action(num, name, *args, **kwargs):
            nonlocal self
            iter_list = self.list_slicing(index, element_value)
            paser_list = []
            for bs in iter_list:
                try:
                    re = eval(str(path), {"self": bs})
                    if re is None:
                        raise PageParserError
                    paser_list.append(re)
                except BaseException as e:
                    logging.warning(str(e))
            self.element_dict[f"{name}[{num}]"] = paser_list
        self.add_func(f"get>{path}:{element_value}[{index}]", action)  # register the func

    def webpage_snapshot(self, **kwargs):
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            md5 = hashlib.md5()  # name the screenshot with an MD5 digest
            md5.update(f"{time.time()}_{self.url_text}".encode("utf-8"))
            name = md5.hexdigest()
            with open(self.dir + f"{os.sep}" + name + ".png.CoTanURL", "w") as f:
                f.write(self.url_text)
            self.browser.save_screenshot(self.dir + f"{os.sep}" + name + ".png")
            sleep(1)
        self.add_func(f"Webpage_snapshot", action)  # register the func


class PageParserData(PageParserDatabase, PageParserDataSource, PageParserDataFindall, PageParserTool):
    pass
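# Typical data-extraction chain on a combined parser: to_text() grabs the page source,
# make_bs() parses it into a bs4 soup, findall()/findall_by_text() filter tags, and
# to_database() appends the selected fields ("$name&", "$self&", "$string$" or an
# attribute name) to the module-level data_base controller. Each step reads its input
# from element_dict under the "{name}[{num}]" key written by the previous step.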
@plugin_class_loading(get_path(r'template/crawler'))
class PageParserChainsWindow(PageParserFunc):
    def get_all_windows(self, *args, **kwargs):  # get all window handles
        @self.add_base
        def find(num, name, *args, **kwargs):
            nonlocal self
            # window handles
            self.element_dict[f"{name}[{num}]"] = self.browser.window_handles
        self.add_func(f"get_all_windows", find)  # register the func

    def get_now_windows(self, *args, **kwargs):  # get the current window handle
        @self.add_base
        def find(num, name, *args, **kwargs):
            nonlocal self
            self.element_dict[f"{name}[{num}]"] = [
                self.browser.current_window_handle
            ]  # current window handle
        self.add_func(f"get_now_window", find)  # register the func

    def switch_to_windwos(self, element_value, index=0, **kwargs):  # switch windows
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.browser.switch_to.window(self.element_dict[element_value][index])
        self.add_func(f"switch_to_window>{element_value}[{index}]", action)  # register the func
@plugin_class_loading(get_path(r'template/crawler'))
class PageParserClick(PageParserFunc):
    def action_click(self, chains, element_value, index, **kwargs):  # left click
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.element_dict[chains][0].click(self.element_dict[element_value][index])
        self.add_func(f"[{chains}]click>[{element_value}][{index}]", action)  # register the func

    def action_double_click(self, chains, element_value, index, **kwargs):  # double left click
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.element_dict[chains][0].double_click(
                self.element_dict[element_value][index]
            )
        self.add_func(
            f"[{chains}]double_click>[{element_value}][{index}]", action
        )  # register the func

    def action_click_right(self, chains, element_value, index, **kwargs):  # right click
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.element_dict[chains][0].context_click(
                self.element_dict[element_value][index]
            )
        self.add_func(
            f"[{chains}]right_click>[{element_value}][{index}]", action
        )  # register the func

    def action_click_and_hold(self, chains, element_value, index, **kwargs):  # hold the left button down
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.element_dict[chains][0].click_and_hold(
                self.element_dict[element_value][index]
            )
        self.add_func(
            f"[{chains}]click_and_hold>[{element_value}][{index}]", action
        )  # register the func
@plugin_class_loading(get_path(r'template/crawler'))
class PageParserChainsMouse(PageParserFunc):
    def action_release(self, chains, element_value, index, **kwargs):  # release the left button
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.element_dict[chains][0].release(
                self.element_dict[element_value][index]
            )
        self.add_func(f"[{chains}]release>[{element_value}][{index}]", action)  # register the func

    def action_drag_and_drop(
        self, chains, element_value, index, element_value2, index2, **kwargs
    ):  # drag and drop
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.element_dict[chains][0].drag_and_drop(
                self.element_dict[element_value][index],
                self.element_dict[element_value2][index2],
            )
        self.add_func(
            f"[{chains}]drag_and_drop>[{element_value}][{index}]", action
        )  # register the func

    def action_move(self, chains, element_value, index, **kwargs):  # move the mouse to an element
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.element_dict[chains][0].move_to_element(
                self.element_dict[element_value][index]
            )
        self.add_func(
            f"[{chains}]move_to>[{element_value}][{index}]", action
        )  # register the func
@plugin_class_loading(get_path(r'template/crawler'))
class PageParserChainsKeys(PageParserFunc):
    @staticmethod
    def special_keys(key: str, is_special_keys):
        if is_special_keys:
            return keys_name_dict.get(key.lower(), key), f"[{key.upper()}]"
        else:
            return key, key

    def action_key_down(
        self, chains, key, element_value, index, is_special_keys, **kwargs
    ):  # press a key down
        new_key, key = self.special_keys(key, is_special_keys)

        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.element_dict[chains][0].key_down(
                new_key, self.element_dict[element_value][index]
            )
        self.add_func(
            f"[{chains}]key_down>{key}:[{element_value}][{index}]", action
        )  # register the func

    def action_key_up(
        self, chains, key, element_value, index, is_special_keys, **kwargs
    ):  # release a key
        new_key, key = self.special_keys(key, is_special_keys)

        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.element_dict[chains][0].key_up(
                new_key, self.element_dict[element_value][index]
            )
        self.add_func(
            f"[{chains}]key_up>{key}:[{element_value}][{index}]", action
        )  # register the func

    def action_send_keys_to_element(
        self, chains, key, element_value, index, is_special_keys, **kwargs
    ):
        new_key, key = self.special_keys(key, is_special_keys)

        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.element_dict[chains][0].send_keys_to_element(
                self.element_dict[element_value][index], new_key
            )
        self.add_func(
            f"[{chains}]sent>{key}:[{element_value}][{index}]", action
        )  # register the func

    def action_send_keys(self, chains, key, is_special_keys, **kwargs):  # send keys to the focused element
        new_key, key = self.special_keys(key, is_special_keys)

        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.element_dict[chains][0].send_keys(new_key)
        self.add_func(f"[{chains}].sent>{key}", action)  # register the func
@plugin_class_loading(get_path(r'template/crawler'))
class PageParserChains(PageParserChainsWindow, PageParserClick, PageParserChainsMouse,
                       PageParserChainsKeys):
    def make_action_chains(self, **kwargs):  # create an action chain
        @self.add_base
        def action(num, name, *args, **kwargs):
            nonlocal self
            self.element_dict[f"{name}[{num}]"] = [ActionChains(self.browser)]
        self.add_func(f"make_ActionChains", action)  # register the func

    def action_run(self, chains, run_time=1, **kwargs):  # perform the chain
        @self.add_base
        def action(*args, **kwargs):
            nonlocal self
            self.element_dict[chains][0].perform()
            sleep(run_time)
        self.add_func(f"[{chains}].run<{run_time}s", action)  # register the func


for i in range(1, 13):  # F1 - F12 keys
    keys_name_dict[f"f{i}"] = getattr(Keys, f"F{i}")

data_base = DatabaseController()
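# End-to-end wiring sketch. All composed class names below are assumptions for
# illustration; the concrete combinations are presumably assembled elsewhere in CoTan
# (e.g. by the GUI layer that loads these plugin classes).
#
#     class Url(UrlFile, UrlAdd, UrlReturn):
#         pass
#
#     class PageDownloader(PageDownloaderSelenium, PageDownloaderRequests, PageDownloaderCookies):
#         pass
#
#     class PageParser(PageParserAutomation, PageParserBrowser, PageParserNeighbor,
#                      PageParserData, PageParserChains):
#         pass
#
#     url_manager = Url(os.getcwd(), os.getcwd())
#     downloader = PageDownloader(url_manager, os.getcwd())
#     parser = PageParser(downloader)                    # registers itself via set_page_parser
#     url_manager.add_url("https://example.com", "get", time_out=10)
#     downloader.start_to_run(func_cookie=lambda cookies: None)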