Pārlūkot izejas kodu

新增:URL回调和解析json

Huan 5 gadi atpakaļ
vecāks
revīzija
a611c52ab5
2 mainīti faili ar 54 papildinājumiem un 10 dzēšanām
  1. 30 2
      Crawler_controller.py
  2. 24 8
      Web_Crawler.py

+ 30 - 2
Crawler_controller.py

@@ -203,7 +203,7 @@ class Page_Downloader:
             except:
             except:
                 pass
                 pass
             self.start_cookies(func_cookie,url)
             self.start_cookies(func_cookie,url)
-        else:
+        else:#requests模式
             try:
             try:
                 args = {'cookies':self.cookie_dict[self.nowurl.cookies]}
                 args = {'cookies':self.cookie_dict[self.nowurl.cookies]}
                 func_cookie([args['cookies']])
                 func_cookie([args['cookies']])
@@ -754,7 +754,7 @@ class Page_Parser:
 
 
     def Webpage_snapshot(self,**kwargs):
     def Webpage_snapshot(self,**kwargs):
         @self.add_base
         @self.add_base
-        def action(num, name, *args, **kwargs):
+        def action(*args, **kwargs):
             nonlocal self
             nonlocal self
             md5 = hashlib.md5()  # 应用MD5算法
             md5 = hashlib.md5()  # 应用MD5算法
             md5.update(f'{time.time()}_{self.now_url}'.encode('utf-8'))
             md5.update(f'{time.time()}_{self.now_url}'.encode('utf-8'))
@@ -765,6 +765,34 @@ class Page_Parser:
             sleep(1)
             sleep(1)
         self.add_func(f'Webpage_snapshot', action)  # 添加func
         self.add_func(f'Webpage_snapshot', action)  # 添加func
 
 
+    def add_url(self, element_value, index: (slice, int), url_name,update_func,url_args:dict, **kwargs):# Register an action that derives a URL from each selected element and adds it automatically
+        @self.add_base
+        def action(*args, **kwargs):
+            nonlocal self
+            iter_list = self.listSlicing(index, element_value)  # presumably the stored elements for element_value narrowed by index -- TODO confirm against listSlicing
+            for bs in iter_list:
+                try:
+                    if url_name == '$name&':  # sentinel: use the element's .name
+                        new_url = bs.name
+                    elif url_name == '$self&':  # sentinel: use the element's full string form with newlines removed
+                        new_url = str(bs).replace('\n', '')
+                    elif url_name == '$string$':  # sentinel: use the element's .string with newlines removed
+                        new_url = str(bs.string).replace('\n', '')
+                    else:  # otherwise treat url_name as an attribute name; a missing attribute yields ''
+                        new_url = bs.attrs.get(url_name, '')
+                    url.add_url(new_url, **url_args)  # NOTE(review): `url` is not a local or parameter; it resolves at module level and may be the manager class rather than an instance -- confirm
+                except:  # bare except: any failure for an element (including errors from url.add_url) is skipped silently
+                    pass
+            update_func()# caller-supplied callback, invoked once after the loop to refresh the tkinter UI
+        self.add_func(f'add_URL<{element_value}[{index}]:{url_name}', action)  # register the action under a descriptive name
+
+    def to_json(self,**kwargs):  # Register an action that stores the JSON-decoded response as an element; kwargs are accepted but unused here
+        @self.add_base
+        def action(num, name, *args, **kwargs):
+            nonlocal self
+            self.element_dict[f'{name}[{num}]'] = [self.browser.json()]# parse the response as JSON and store it as a one-item list; assumes self.browser is a requests-style response -- TODO confirm
+        self.add_func(f'to_json', action)  # register the action under the name 'to_json'
+
     def Element_interaction(self,update_func=lambda *args:None):#元素交互
     def Element_interaction(self,update_func=lambda *args:None):#元素交互
         func_list = self.func_list
         func_list = self.func_list
         status = None
         status = None

+ 24 - 8
Web_Crawler.py

@@ -450,7 +450,7 @@ def Main():
     tkinter.Button(top, bg=bbg, fg=fg, text='关闭数据表', command=close, font=FONT,
     tkinter.Button(top, bg=bbg, fg=fg, text='关闭数据表', command=close, font=FONT,
                    width=width_B, height=height_B).grid(column=a_x + 2, row=a_y, sticky=tkinter.E + tkinter.W)
                    width=width_B, height=height_B).grid(column=a_x + 2, row=a_y, sticky=tkinter.E + tkinter.W)
 
 
-    global Data_Input,DataBase_BOX,DataName_Input
+    global Data_Input,DataBase_BOX,DataName_Input,URLTAG_Input
     a_y += 1
     a_y += 1
     tkinter.Label(top, text='数据存入格式:', bg=bg, fg=fg, font=FONT, width=width_B, height=height_B).grid(column=a_x,row=a_y)
     tkinter.Label(top, text='数据存入格式:', bg=bg, fg=fg, font=FONT, width=width_B, height=height_B).grid(column=a_x,row=a_y)
     Data_Input = tkinter.Entry(top, width=width_B * 2)
     Data_Input = tkinter.Entry(top, width=width_B * 2)
@@ -466,20 +466,26 @@ def Main():
     DataBase_BOX.grid(column=a_x, row=a_y, columnspan=3, rowspan=3, sticky=tkinter.E + tkinter.W + tkinter.S + tkinter.N)
     DataBase_BOX.grid(column=a_x, row=a_y, columnspan=3, rowspan=3, sticky=tkinter.E + tkinter.W + tkinter.S + tkinter.N)
 
 
     a_y += 3
     a_y += 3
+    tkinter.Label(top, text='URL标签:', bg=bg, fg=fg, font=FONT, width=width_B, height=height_B).grid(column=a_x,row=a_y)
+    URLTAG_Input = tkinter.Entry(top, width=width_B * 2)
+    URLTAG_Input.grid(column=a_x + 1, row=a_y, columnspan=2, sticky=tkinter.E + tkinter.W)
+
+    a_y += 1
     tkinter.Button(top, bg=bbg, fg=fg, text='导出页面快照',command=lambda :Page_Parser_addActionFunc2('png'), font=FONT, width=width_B,height=height_B).grid(
     tkinter.Button(top, bg=bbg, fg=fg, text='导出页面快照',command=lambda :Page_Parser_addActionFunc2('png'), font=FONT, width=width_B,height=height_B).grid(
         column=a_x, row=a_y, sticky=tkinter.E + tkinter.W)
         column=a_x, row=a_y, sticky=tkinter.E + tkinter.W)
-    tkinter.Button(top, bg=bbg, fg=fg, text='NONE',command=out, font=FONT, width=width_B,height=height_B).grid(
+    tkinter.Button(top, bg=bbg, fg=fg, text='回调添加URL',command=add_url_from_tag, font=FONT, width=width_B,height=height_B).grid(
         column=a_x+1, row=a_y, sticky=tkinter.E + tkinter.W)
         column=a_x+1, row=a_y, sticky=tkinter.E + tkinter.W)
-    tkinter.Button(top, bg=bbg, fg=fg, text='NONE', command=close, font=FONT,
+    tkinter.Button(top, bg=bbg, fg=fg, text='解析为json', command=lambda :Page_Parser_addActionFunc2('to_json'), font=FONT,
                    width=width_B, height=height_B).grid(column=a_x + 2, row=a_y, sticky=tkinter.E + tkinter.W)
                    width=width_B, height=height_B).grid(column=a_x + 2, row=a_y, sticky=tkinter.E + tkinter.W)
 
 
     top.update()#要预先update一下,否则会卡住
     top.update()#要预先update一下,否则会卡住
     global url,loader,Page_Parser,DataBase,save_dir
     global url,loader,Page_Parser,DataBase,save_dir
     save_dir = askdirectory(title='选择项目位置')#项目位置
     save_dir = askdirectory(title='选择项目位置')#项目位置
-    url = Crawler_controller.url(save_dir,save_dir)
-    loader = Crawler_controller.Page_Downloader(url,save_dir)
-    Page_Parser = Crawler_controller.Page_Parser(loader)
-    DataBase = Crawler_controller.data_base
+    url = Crawler_controller.url(save_dir,save_dir)#url管理器
+    loader = Crawler_controller.Page_Downloader(url,save_dir)#页面下载器
+    Page_Parser = Crawler_controller.Page_Parser(loader)#页面解析器
+    DataBase = Crawler_controller.data_base#数据库
+
     top.mainloop()
     top.mainloop()
 
 
 def to_Database(is_tag=True):
 def to_Database(is_tag=True):
@@ -613,7 +619,7 @@ def Page_Parser_addActionFunc2(func):
             'make_bs':Page_Parser.make_bs,'findAll':Page_Parser.findAll,'findAll_by_text':Page_Parser.findAll_by_text,
             'make_bs':Page_Parser.make_bs,'findAll':Page_Parser.findAll,'findAll_by_text':Page_Parser.findAll_by_text,
             'get_children':Page_Parser.get_children,'get_offspring':Page_Parser.get_offspring,'get_up':Page_Parser.get_up,
             'get_children':Page_Parser.get_children,'get_offspring':Page_Parser.get_offspring,'get_up':Page_Parser.get_up,
             'get_down':Page_Parser.get_down,'get_by_path':Page_Parser.get_by_path,'brothers':Page_Parser.get_brothers,
             'get_down':Page_Parser.get_down,'get_by_path':Page_Parser.get_by_path,'brothers':Page_Parser.get_brothers,
-            'png':Page_Parser.Webpage_snapshot}.get(func,Page_Parser.make_bs)
+            'png':Page_Parser.Webpage_snapshot,'to_json':Page_Parser.to_json}.get(func,Page_Parser.make_bs)
     FUNC(**args)
     FUNC(**args)
     Update_Parser_Func_BOX()
     Update_Parser_Func_BOX()
 
 
@@ -768,6 +774,16 @@ def add_url():
     url.add_url(new_url,**args)
     url.add_url(new_url,**args)
     update_URLBOX()
     update_URLBOX()
 
 
+def add_url_from_tag():  # Button callback: queue a Page_Parser.add_url action built from the current UI inputs
+    global URLTAG_Input,Page_Parser,Var_Input
+    try:
+        index = eval(VarIndex_Input.get(),{})  # NOTE(review): eval of text typed into the UI; the empty globals dict does not remove builtins, so this is not a sandbox
+    except:  # bare except: any evaluation failure falls back to selecting everything
+        index = slice(None,None)
+    Page_Parser.add_url(element_value=Var_Input.get(),index=index,url_name=URLTAG_Input.get(),update_func=update_URLBOX,  # update_URLBOX is passed as the refresh callback
+                        url_args=add_args())
+    Update_Parser_Func_BOX()  # refresh the list of queued parser actions
+
 def update_URLBOX():
 def update_URLBOX():
     global url,URL_BOX
     global url,URL_BOX
     URL_BOX.delete(0,tkinter.END)
     URL_BOX.delete(0,tkinter.END)