1
0

Web_Crawler.py 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253
  1. import Crawler_controller
  2. import os
  3. import tkinter
  4. from tkinter.filedialog import askdirectory
  5. import re
  6. import threading
  7. import time
  8. def Main():
  9. global top,Git,PATH,bg,bbg,fg,cookies_list
  10. PATH = os.getcwd()
  11. top = tkinter.Tk()
  12. cookies_list = []
  13. bg = '#FFFAFA' # 主颜色
  14. bbg = '#FFFAFA' # 按钮颜色
  15. fg = '#000000' # 文字颜色
  16. top["bg"] = bg
  17. FONT = ('黑体', 11) # 设置字体
  18. top.title('CoTan仓库管理器')
  19. top.resizable(width=False, height=False)
  20. top.geometry('+10+10') # 设置所在位置
  21. width_B = 13 # 标准宽度
  22. height_B = 2
  23. a_y = 0
  24. a_x = 0
  25. tkinter.Button(top, bg=bbg, fg=fg, text='添加url',command=add_url , font=FONT, width=width_B,
  26. height=height_B).grid(column=a_x, row=a_y, sticky=tkinter.E + tkinter.W)
  27. tkinter.Button(top, bg=bbg, fg=fg, text='删除url',command=del_url , font=FONT, width=width_B,
  28. height=height_B).grid(column=a_x+1, row=a_y, sticky=tkinter.E + tkinter.W)
  29. tkinter.Button(top, bg=bbg, fg=fg, text='应用过滤机制', font=FONT, width=width_B,
  30. height=height_B).grid(column=a_x+2, row=a_y, sticky=tkinter.E + tkinter.W)
  31. global URL_BOX,URL_Input,Func_BOX
  32. a_y += 1
  33. tkinter.Label(top, text='添加url:', bg=bg, fg=fg, font=FONT, width=width_B, height=height_B).grid(column=a_x,row=a_y)
  34. URL_Input = tkinter.Entry(top, width=width_B * 2)
  35. URL_Input.grid(column=a_x + 1, row=a_y, columnspan=2, sticky=tkinter.E + tkinter.W)
  36. a_y += 1
  37. URL_BOX = tkinter.Listbox(top, width=width_B * 3, height=height_B * 3)
  38. URL_BOX.grid(column=a_x, row=a_y, columnspan=3, rowspan=3, sticky=tkinter.E + tkinter.W + tkinter.S + tkinter.N)
  39. a_y += 3
  40. tkinter.Button(top, bg=bbg, fg=fg, text='HTTPS过滤',command=add_filter_func_HTTPS, font=FONT, width=width_B,height=height_B).grid(
  41. column=a_x, row=a_y, sticky=tkinter.E + tkinter.W)
  42. tkinter.Button(top, bg=bbg, fg=fg, text='WWW过滤',command=add_filter_func_WWW, font=FONT, width=width_B,height=height_B).grid(
  43. column=a_x+1, row=a_y, sticky=tkinter.E + tkinter.W)
  44. tkinter.Button(top, bg=bbg, fg=fg, text='删除过滤',command=del_func, font=FONT, width=width_B,height=height_B).grid(
  45. column=a_x+2, row=a_y, sticky=tkinter.E + tkinter.W)
  46. a_y += 1
  47. tkinter.Button(top, bg=bbg, fg=fg, text='自定义过滤',command=add_filter_func_HTTPS, font=FONT, width=width_B,height=height_B).grid(
  48. column=a_x, row=a_y,columnspan=2, sticky=tkinter.E + tkinter.W)
  49. tkinter.Button(top, bg=bbg, fg=fg, text='清空过滤', font=FONT, width=width_B,height=height_B).grid(
  50. column=a_x+2, row=a_y, sticky=tkinter.E + tkinter.W)
  51. global Func_BOX,cookies_fixed
  52. a_y += 1
  53. Func_BOX = tkinter.Listbox(top, width=width_B * 3, height=height_B * 2)
  54. Func_BOX.grid(column=a_x, row=a_y, columnspan=3, rowspan=2, sticky=tkinter.E + tkinter.W + tkinter.S + tkinter.N)
  55. global wait_Func_BOX,Wait_Input,cookies_BOX
  56. a_y += 2
  57. tkinter.Button(top, bg=bbg, fg=fg, text='执行网页下载',command=startDownloader, font=FONT, width=width_B,height=height_B).grid(
  58. column=a_x, row=a_y, sticky=tkinter.E + tkinter.W)
  59. tkinter.Button(top, bg=bbg, fg=fg, text='显式等待',command=add_time_wait, font=FONT, width=width_B,height=height_B).grid(
  60. column=a_x+1, row=a_y, sticky=tkinter.E + tkinter.W)
  61. Wait_Input = tkinter.Entry(top, width=width_B)
  62. Wait_Input.grid(column=a_x + 2, row=a_y, sticky=tkinter.E + tkinter.W)
  63. a_y += 3
  64. tkinter.Button(top, bg=bbg, fg=fg, text='自定义等待策略',command=add_filter_func_HTTPS, font=FONT, width=width_B,height=height_B).grid(
  65. column=a_x, row=a_y, sticky=tkinter.E + tkinter.W)
  66. tkinter.Button(top, bg=bbg, fg=fg, text='删除等待策略',command=del_waitfunc, font=FONT, width=width_B,height=height_B).grid(
  67. column=a_x+1, row=a_y, sticky=tkinter.E + tkinter.W)
  68. tkinter.Button(top, bg=bbg, fg=fg, text='清空等待策略',command=del_func, font=FONT, width=width_B,height=height_B).grid(
  69. column=a_x+2, row=a_y, sticky=tkinter.E + tkinter.W)
  70. a_y += 1
  71. wait_Func_BOX = tkinter.Listbox(top, width=width_B * 3, height=height_B * 2)
  72. wait_Func_BOX.grid(column=a_x, row=a_y, columnspan=3, rowspan=2, sticky=tkinter.E + tkinter.W + tkinter.S + tkinter.N)
  73. a_y += 2
  74. cookies_fixed = tkinter.Variable()
  75. tkinter.Label(top, text='【曲奇监视】', bg=bg, fg=fg, font=FONT, width=width_B, height=height_B).grid(
  76. column=a_x+1,row=a_y,sticky=tkinter.E + tkinter.W + tkinter.W + tkinter.S + tkinter.N) # 设置说明
  77. tkinter.Checkbutton(top, bg=bg, fg=fg, activebackground=bg, activeforeground=fg, selectcolor=bg, text='固定曲奇',
  78. variable=cookies_fixed).grid(column=a_x + 2, row=a_y, sticky=tkinter.W)
  79. cookies_fixed.set('0')
  80. a_y += 1
  81. cookies_BOX = tkinter.Listbox(top, width=width_B * 3, height=height_B * 2)
  82. cookies_BOX.grid(column=a_x, row=a_y, columnspan=3, rowspan=2, sticky=tkinter.E + tkinter.W + tkinter.S + tkinter.N)
  83. a_y += 2
  84. tkinter.Button(top, bg=bbg, fg=fg, text='清空曲奇',command=Tra_cookies, font=FONT, width=width_B,height=height_B).grid(
  85. column=a_x, row=a_y, sticky=tkinter.E + tkinter.W)
  86. tkinter.Button(top, bg=bbg, fg=fg, text='更新曲奇',command=Update_cookies, font=FONT, width=width_B,height=height_B).grid(
  87. column=a_x+1, row=a_y, sticky=tkinter.E + tkinter.W)
  88. tkinter.Button(top, bg=bbg, fg=fg, text='删除曲奇',command=Del_cookies, font=FONT, width=width_B,height=height_B).grid(
  89. column=a_x+2, row=a_y, sticky=tkinter.E + tkinter.W)
  90. global cookies_Input,PAGE_BOX
  91. a_y += 1
  92. cookies_Input = tkinter.Entry(top, width=width_B * 2)
  93. cookies_Input.grid(column=a_x, row=a_y, columnspan=2, sticky=tkinter.E + tkinter.W)
  94. tkinter.Button(top, bg=bbg, fg=fg, text='添加曲奇',command=Add_cookies, font=FONT, width=width_B,height=height_B).grid(
  95. column=a_x+2, row=a_y, sticky=tkinter.E + tkinter.W)
  96. a_y += 1
  97. cookies_fixed = tkinter.Variable()
  98. tkinter.Label(top, text='【已存储页面】', bg=bg, fg=fg, font=FONT, width=width_B, height=height_B).grid(
  99. column=a_x,row=a_y,columnspan=3,sticky=tkinter.E + tkinter.W + tkinter.W + tkinter.S + tkinter.N)
  100. a_y += 1
  101. PAGE_BOX = tkinter.Listbox(top, width=width_B * 3, height=height_B * 2)
  102. PAGE_BOX.grid(column=a_x, row=a_y, columnspan=3, rowspan=2, sticky=tkinter.E + tkinter.W + tkinter.S + tkinter.N)
  103. top.update()#要预先update一下,否则会卡住
  104. global url,loader
  105. save_dir = askdirectory(title='选择项目位置')#项目位置
  106. url = Crawler_controller.url(save_dir,save_dir)
  107. loader = Crawler_controller.Page_Downloader(url,save_dir)
  108. top.mainloop()
  109. def PAGE_BOX_Update(PAGE_list):
  110. global PAGE_BOX
  111. PAGE_BOX.delete(0,tkinter.END)
  112. PAGE_BOX.insert(0,*PAGE_list)
  113. def Update_cookies():
  114. global cookies_BOX,cookies_list,cookies_Input
  115. cookies = eval(cookies_Input.get(),{})
  116. if cookies_fixed.get() == '0':return False
  117. try:
  118. name = cookies_list[cookies_BOX.curselection()[0]].get('name')
  119. loader.update_cookies(name,cookies)
  120. cookies_fixed.set('0')
  121. except:
  122. pass
  123. def Add_cookies():
  124. global cookies_BOX,cookies_list,cookies_Input
  125. cookies = eval(cookies_Input.get(),{})
  126. if cookies_fixed.get() == '0':return False
  127. try:
  128. loader.Add_cookies(cookies)
  129. cookies_fixed.set('0')
  130. except:
  131. raise
  132. def Tra_cookies():
  133. global cookies_BOX,cookies_list
  134. if cookies_fixed.get() == '0':return False
  135. try:
  136. loader.Tra_cookies()
  137. cookies_fixed.set('0')
  138. except:
  139. pass
  140. def Del_cookies():
  141. global cookies_BOX,cookies_list
  142. if cookies_fixed.get() == '0':return False
  143. try:
  144. name = cookies_list[cookies_BOX.curselection()[0]].get('name')
  145. print(name)
  146. loader.Del_cookies(name)
  147. cookies_fixed.set('0')
  148. except:
  149. pass
  150. def cookies_BOX_Update(cookies):
  151. global cookies_BOX,cookies_list
  152. if cookies_fixed.get() == '0':
  153. cookies_list = cookies
  154. cookies_BOX.delete(0,tkinter.END)
  155. cookies_BOX.insert(0,*cookies)
  156. def add_time_wait():#显式等待一定s
  157. global url,Wait_Input
  158. times = float(Wait_Input.get())
  159. def wait_time(*args):
  160. time.sleep(times)
  161. return True,f'After_{time}s'
  162. loader.Add_func(wait_time,f'wait {times}s')
  163. update_Wait_Input()
  164. def del_waitfunc():#删除策略
  165. global wait_Func_BOX
  166. index = wait_Func_BOX.curselection()[0]
  167. loader.Del_func(index)
  168. update_Wait_Input()
  169. def update_Wait_Input():
  170. global loader,wait_Func_BOX
  171. wait_Func_BOX.delete(0,tkinter.END)
  172. wait_Func_BOX.insert(tkinter.END,*loader.return_func())
  173. def startDownloader():
  174. def startLoader():
  175. global loader
  176. loader.strat_urlGet()
  177. loader.Logical_operation(cookies_BOX_Update,PAGE_BOX_Update)
  178. loader.save_Page()
  179. new = threading.Thread(target=startLoader)
  180. new.start()
  181. update_URLBOX()
  182. def add_filter_func_HTTPS():
  183. global url
  184. url.Add_func(lambda url:re.match(re.compile('^https://'),url),'HTTPS过滤')
  185. update_Func_BOX()
  186. def add_filter_func_WWW():
  187. global url
  188. url.Add_func(lambda url:re.match(re.compile('.*www\.'),url),'www过滤')
  189. update_Func_BOX()
  190. def del_func():
  191. global URL_BOX
  192. index = Func_BOX.curselection()[0]
  193. url.Del_func(index)
  194. update_Func_BOX()
  195. def update_Func_BOX():
  196. global url,Func_BOX
  197. Func_BOX.delete(0,tkinter.END)
  198. Func_BOX.insert(tkinter.END,*url.return_func())
  199. def del_url():
  200. global URL_BOX
  201. index = URL_BOX.curselection()[0]
  202. url.del_url(index)
  203. update_URLBOX()
  204. def add_url():
  205. global URL_Input,url
  206. new_url = URL_Input.get()
  207. if new_url == '':return
  208. url.add_url(new_url)
  209. update_URLBOX()
  210. def update_URLBOX():
  211. global url,URL_BOX
  212. URL_BOX.delete(0,tkinter.END)
  213. URL_BOX.insert(tkinter.END,*url.return_url())
  214. if __name__ == "__main__":
  215. Main()