Ver código fonte

URL控制器,新增两种页面数据:post和get

Huan 5 anos atrás
pai
commit
9f6c133603
3 arquivos alterados com 217 adições e 40 exclusões
  1. 149 29
      Crawler_controller.py
  2. 1 1
      Information_storage.py
  3. 67 10
      Web_Crawler.py

+ 149 - 29
Crawler_controller.py

@@ -8,21 +8,97 @@ from time import sleep
 import bs4
 import re as regular
 import Information_storage
+import requests
 
 data_base = Information_storage.DataBase_Home()
 
-class URL_PAGE():
-    def __init__(self,url,func='get'):
+class PAGE:
+    def __init__(self):
+        self.url=''
+        self.UA=''
+        self.func = 'PAGE'
+
+    def __str__(self):
+        return f'{self.func}-{self.url}:UA>{self.UA}'
+
+class REQUESTS_Base(PAGE):
+    def init(self,UA,url,cookies):
+        if UA == '':
+            UA = f'--user-agent ="Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' \
+                 f'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36 Edg/80.0.361.66"'
+        self.UA = UA
+        self.headers = {'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
+                   'Accept - Encoding': 'gzip, deflate',
+                   'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
+                   'Connection': 'Keep-Alive',
+                   'User-Agent': UA}
+        self.requests = lambda *args:None
+        self.url = url
+        self.cookies = cookies
+        self.new = True
+
+class URL_POST(REQUESTS_Base):#通过requests的post请求
+    def __init__(self, url, data,UA='',cookies=None, **kwargs):
+        super(URL_POST, self).__init__()
+        self.func = 'post'
+        self.data = data
+        self.requests = requests.post
+        self.init(UA,url,cookies)
+
+    def __str__(self):
+        return super(URL_POST, self).__str__() + f';data>{self.data}'
+
+class URL_GET(REQUESTS_Base):#通过requests的post请求
+    def __init__(self, url,UA='',cookies=None, **kwargs):
+        super(URL_GET, self).__init__()
+        self.func = 'simplify_get'
+        self.requests = requests.get
+        self.init(UA,url,cookies)
+
+class URL_PAGE(PAGE):
+    def __init__(self,url,first_run=False,head=False,no_plugins=True,no_js=False,no_java=False,
+                 no_img=False,UA='',cookies=None,new=False,down_load_dir='',**kwargs):
+        super(URL_PAGE, self).__init__()
         self.url = url
-        self.func = func
+        self.func = 'get'
+        self.options = webdriver.ChromeOptions()
+        self.cookies = cookies#cookies存储位置
+        self.new = new#新键页面or新键浏览器
+        self.down_load_dir = down_load_dir
+        self.init(first_run,head,no_plugins,no_js,no_java,no_img,UA)
+
+    def init(self,first_run,head,no_plugins,no_js,no_java,no_img,UA):
+        self.options.add_argument('disable-infobars')#不显示
+        prefs = {'profile.default_content_settings.popups': 0, 'download.default_directory':self.down_load_dir}
+        self.options.add_experimental_option('prefs', prefs)#下载设置
+        if first_run:
+            self.options.add_argument('-first run')
+        if head:#无头设置
+            print('FFF')
+            self.options.add_argument('--headless')
+            self.options.add_argument('--disable-gpu')
+        if no_plugins:
+            self.options.add_argument('--disable-plugins')
+        if no_js:
+            self.options.add_argument('--disable-javascript')
+        if no_java:
+            self.options.add_argument('--disable-java')
+        if no_img:
+            self.options.add_argument('blink-settings=imagesEnabled=false')
+        if UA == '':
+            UA = (f'user-agent ="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
+                  f'Chrome/80.0.3987.132 Safari/537.36"')
+        # self.options.add_argument(f'--user-agent ="{UA}"')
+        self.UA = UA
 
     def __str__(self):
-        return self.url
+        return f'{self.func}-{self.url}:UA>{self.UA}'
 
 class url:#url管理器
     num = 0#url处理器个数
     def __init__(self,dic=f'',dic_run=f''):
         url.num += 1
+        self.save_dir = dic
         dic += f'/url[{url.num}].cot_url'
         dic_run += f'/url_run[{url.num}].cot_url'
         self.dir = dic
@@ -33,7 +109,7 @@ class url:#url管理器
         self.url_history = []#url历史
         self.filter = {}#过滤函数
 
-    def filter_func(self,url):#url过滤系统
+    def filter_func(self,url,**kwargs):#url过滤系统
         for i in self.filter:
             if not self.filter[i](url): return False
         return True
@@ -47,11 +123,21 @@ class url:#url管理器
     def return_func(self):
         return list(self.filter.keys())
 
-    def add_url(self,url):#添加url
-        if url not in self.url_history and self.filter_func(url):#1.url不存在历史,2.url满足筛选条件
-            self.url_list.append(URL_PAGE(url,'get'))#添加到待取得url
-            self.url_history.append(url)#添加到历史url
-            self.__out_url(url)#输出历史url
+    def add_url(self,url,func,data=None,**kwargs):#添加url
+        if func == '':func = 'simplify_get'
+        if func == 'get':url_ = url
+        else:
+            url_ = url + str(data)
+        if url_ not in self.url_history and self.filter_func(url,func=func):#1.url不存在历史,2.url满足筛选条件
+            if func == 'get':
+                self.url_list.append(URL_PAGE(url=url,**kwargs,down_load_dir=self.dir))#添加到待取得url
+            elif func == 'simplify_get':
+                self.url_list.append(URL_GET(url=url, **kwargs, down_load_dir=self.dir))  # 添加到待取得url
+            else:
+                self.url_list.append(URL_POST(url=url,data=data,**kwargs))  # 添加到待取得url
+
+            self.url_history.append(url_)#添加到历史url
+            self.__out_url(url_)#输出历史url
             return True#写入成功
         return False#写入失败
 
@@ -59,7 +145,7 @@ class url:#url管理器
         self.__out_url_run(f'DELETE {self.url_list[index]}')
         del self.url_list[index]
 
-    def get_url(self) -> URL_PAGE:#取得url
+    def get_url(self) -> (URL_PAGE,URL_POST):#取得url
         url_page = self.url_list[0]
         self.__out_url_run(url_page.url)
         del self.url_list[0]
@@ -88,35 +174,66 @@ class Page_Downloader:
         self.page_source_dict = {}#页面保存信息
         self.cookie_Thread = None#子进程
         self.browser = None
-
-    def __seeting(self,*args):#设置参数,请求头
-        options = webdriver.ChromeOptions()
-        options.add_argument('disable-infobars')# 不显示提示语句
-        for i in args:
-            if i == '':continue
-            options.add_argument(i)
-        return options
+        self.cookie_dict = {}
+        self.cookie_dict_list = {}#sele的cookies
+        self.lase_func = ''
 
     def strat_urlGet(self,*args,func_cookie):#用get请求url ->得到一个页面信息
         self.break_ = False
         self.page_source_dict = {}
         self.nowurl = self.url.get_url()#获取一个url
         url = self.nowurl.url
-        self.browser = webdriver.Chrome(chrome_options=self.__seeting(*args))
-        self.browser.get(url)
+        if self.nowurl.func == 'get':
+            if self.nowurl.new == True and self.lase_func == 'get':#重新启动
+                self.browser.quit()
+                self.browser = webdriver.Chrome(chrome_options=self.nowurl.options)
+            try:
+                self.browser.get(url)
+            except:
+                self.browser = webdriver.Chrome(chrome_options=self.nowurl.options)
+                self.browser.get(url)
+            try:
+                if self.nowurl.new != True:raise Exception
+                list_ = self.cookie_dict_list[self.nowurl.cookies]
+                self.Tra_cookies()
+                try:
+                    for i in list_:
+                        self.Add_cookies(i)
+                except:pass
+            except:
+                pass
+            self.start_cookies(func_cookie,url)
+        else:
+            try:
+                args = {'cookies':self.cookie_dict[self.nowurl.cookies]}
+                func_cookie([args['cookies']])
+            except:
+                args = {}
+                func_cookie([])
+            if self.nowurl.func == 'post':args['data'] = self.nowurl.data
+            self.browser = self.nowurl.requests(url,headers=self.nowurl.headers,**args)
+            self.cookie_dict[url] = requests.utils.dict_from_cookiejar(self.browser.cookies)#保存cookies
+            func_cookie([self.cookie_dict[url]])
+        self.lase_func = self.nowurl.func
+        self.Parser.browser = self.browser
+        self.Parser.init(url)
+        return self.browser
+
+    def start_cookies(self,func_cookie,url):
         self.break_ = True
         def update_cookie():
             nonlocal self
             while self.break_:
                 try:
-                    func_cookie(self.browser.get_cookies())  # 与GUI通信显示cookie
+                    cookies = self.browser.get_cookies()
+                    func_cookie(cookies)  # 与GUI通信显示cookie
+                    self.cookie_dict[url] = cookies
                     time.sleep(.5)
-                except:pass
+                except:
+                    pass
+
         self.cookie_Thread = threading.Thread(target=update_cookie)
         self.cookie_Thread.start()
-        self.Parser.browser = self.browser
-        self.Parser.init(url)
-        return self.browser
 
     def Del_cookies(self,name):#删除指定cookies
         browser = self.browser
@@ -130,7 +247,7 @@ class Page_Downloader:
         browser = self.browser
         browser.add_cookie(cookies)
 
-    def update_cookies(self,name,cookies:dict,):
+    def update_cookies(self,name,cookies:dict):
         browser = self.browser
         cookies_list = browser.get_cookies()
         for i in cookies_list:
@@ -423,7 +540,10 @@ class Page_Parser:
         @self.add_base
         def action(num,name,*args, **kwargs):
             nonlocal self
-            self.element_dict[f'{name}[{num}]'] = [self.browser.page_source,self.now_url]
+            try:
+                self.element_dict[f'{name}[{num}]'] = [self.browser.page_source,self.now_url]
+            except:
+                self.element_dict[f'{name}[{num}]'] = [self.browser.text, self.now_url]#request
         self.add_func(f'get_page_source', action)
 
     def out_html(self,element_value,**kwargs):#输出网页源码
@@ -433,7 +553,7 @@ class Page_Parser:
             md5 = hashlib.md5()  # 应用MD5算法
             md5.update(f'{time.time()}_{self.now_url}'.encode('utf-8'))
             name = md5.hexdigest()
-            save_dir = self.dir + '/' + name + '.html'
+            save_dir = self.dir + '/' + name + '.cotan_source'
             print(save_dir)
             with open(save_dir,'w') as f:
                 f.write(self.element_dict[element_value][0])

+ 1 - 1
Information_storage.py

@@ -2,7 +2,7 @@ import os
 
 class Database:
     def __init__(self,name):
-        self.dir = rf'{os.getcwd()}/Database_dir/{name}.cotanDB'#创建保存文件
+        self.dir = rf'{os.getcwd()}/Database_dir/{name}.cotanDB'#创建DB文件
         self.file = open(self.dir,'r+' if os.path.exists(self.dir) else 'w+')
         self.id = 0
         self.name = name

+ 67 - 10
Web_Crawler.py

@@ -40,11 +40,50 @@ def Main():
     URL_Input = tkinter.Entry(top, width=width_B * 2)
     URL_Input.grid(column=a_x + 1, row=a_y, columnspan=2, sticky=tkinter.E + tkinter.W)
 
+    global URL_ARGS,UA_Input,use_Cookies_Input,FUNC_Input,DATA_Input
     a_y += 1
-    URL_BOX = tkinter.Listbox(top, width=width_B * 3, height=height_B * 5)
-    URL_BOX.grid(column=a_x, row=a_y, columnspan=3, rowspan=5, sticky=tkinter.E + tkinter.W + tkinter.S + tkinter.N)
+    URL_ARGS = []
+    lable = ['不加载js','不加载java','不加载插件']#复选框
+    for i in range(3):
+        URL_ARGS.append(tkinter.IntVar())
+        tkinter.Checkbutton(top,bg = bg,fg = fg,activebackground=bg,activeforeground=fg,selectcolor=bg, text=lable[i],
+                            variable=URL_ARGS[-1]).grid(column=a_x+i, row=a_y, sticky=tkinter.W)
 
-    a_y += 5
+    a_y += 1
+    lable = ['第一次启动','隐藏网页','不加载图片']#复选框
+    for i in range(3):
+        URL_ARGS.append(tkinter.IntVar())
+        tkinter.Checkbutton(top,bg = bg,fg = fg,activebackground=bg,activeforeground=fg,selectcolor=bg, text=lable[i],
+                            variable=URL_ARGS[-1]).grid(column=a_x+i, row=a_y, sticky=tkinter.W)
+
+    a_y += 1
+    tkinter.Label(top, text='UA设置:', bg=bg, fg=fg, font=FONT, width=width_B, height=height_B).grid(column=a_x,row=a_y)
+    UA_Input = tkinter.Entry(top, width=width_B * 2)
+    UA_Input.grid(column=a_x + 1, row=a_y, columnspan=2, sticky=tkinter.E + tkinter.W)
+
+    a_y += 1
+    tkinter.Label(top, text='DATA:', bg=bg, fg=fg, font=FONT, width=width_B, height=height_B).grid(column=a_x,row=a_y)
+    DATA_Input = tkinter.Entry(top, width=width_B * 2)
+    DATA_Input.grid(column=a_x + 1, row=a_y, columnspan=2, sticky=tkinter.E + tkinter.W)
+
+    a_y += 1
+    tkinter.Label(top, text='请求方式:', bg=bg, fg=fg, font=FONT, width=width_B, height=height_B).grid(column=a_x,row=a_y)
+    FUNC_Input = tkinter.Entry(top, width=width_B * 2)
+    FUNC_Input.grid(column=a_x + 1, row=a_y, columnspan=2, sticky=tkinter.E + tkinter.W)
+
+    a_y += 1
+    tkinter.Label(top, text='Cookies:', bg=bg, fg=fg, font=FONT, width=width_B, height=height_B).grid(column=a_x,row=a_y)
+    use_Cookies_Input = tkinter.Entry(top, width=width_B)
+    use_Cookies_Input.grid(column=a_x + 1, row=a_y, sticky=tkinter.E + tkinter.W)
+    URL_ARGS.append(tkinter.IntVar())
+    tkinter.Checkbutton(top, bg=bg, fg=fg, activebackground=bg, activeforeground=fg, selectcolor=bg, text='新启动网页',
+                        variable=URL_ARGS[-1]).grid(column=a_x + 2, row=a_y, sticky=tkinter.W)
+
+    a_y += 1
+    URL_BOX = tkinter.Listbox(top, width=width_B * 3, height=height_B * 4)
+    URL_BOX.grid(column=a_x, row=a_y, columnspan=3, rowspan=4, sticky=tkinter.E + tkinter.W + tkinter.S + tkinter.N)
+
+    a_y += 4
     tkinter.Button(top, bg=bbg, fg=fg, text='HTTPS过滤器',command=add_filter_func_HTTPS, font=FONT, width=width_B,height=height_B).grid(
         column=a_x, row=a_y, sticky=tkinter.E + tkinter.W)
     tkinter.Button(top, bg=bbg, fg=fg, text='WWW过滤器',command=add_filter_func_WWW, font=FONT, width=width_B,height=height_B).grid(
@@ -60,11 +99,11 @@ def Main():
 
     global Func_BOX,cookies_fixed
     a_y += 1
-    Func_BOX = tkinter.Listbox(top, width=width_B * 3, height=height_B * 4)
-    Func_BOX.grid(column=a_x, row=a_y, columnspan=3, rowspan=4, sticky=tkinter.E + tkinter.W + tkinter.S + tkinter.N)
+    Func_BOX = tkinter.Listbox(top, width=width_B * 3, height=height_B * 3)
+    Func_BOX.grid(column=a_x, row=a_y, columnspan=3, rowspan=3, sticky=tkinter.E + tkinter.W + tkinter.S + tkinter.N)
 
     global wait_Func_BOX,Wait_Input,cookies_BOX
-    a_y += 4
+    a_y += 3
     tkinter.Button(top, bg=bbg, fg=fg, text='单点爬虫运行',command=startDownloader, font=FONT, width=width_B,height=height_B).grid(
         column=a_x, row=a_y, sticky=tkinter.E + tkinter.W)
     tkinter.Button(top, bg=bbg, fg=fg, text='爬虫运行',command=startDownloader, font=FONT, width=width_B,height=height_B).grid(
@@ -81,10 +120,10 @@ def Main():
     cookies_fixed.set('0')
 
     a_y += 1
-    cookies_BOX = tkinter.Listbox(top, width=width_B * 3, height=height_B * 8)
-    cookies_BOX.grid(column=a_x, row=a_y, columnspan=3, rowspan=8, sticky=tkinter.E + tkinter.W + tkinter.S + tkinter.N)
+    cookies_BOX = tkinter.Listbox(top, width=width_B * 3, height=height_B * 4)
+    cookies_BOX.grid(column=a_x, row=a_y, columnspan=3, rowspan=4, sticky=tkinter.E + tkinter.W + tkinter.S + tkinter.N)
 
-    a_y += 8
+    a_y += 4
     tkinter.Button(top, bg=bbg, fg=fg, text='清空曲奇',command=Tra_cookies, font=FONT, width=width_B,height=height_B).grid(
         column=a_x, row=a_y, sticky=tkinter.E + tkinter.W)
     tkinter.Button(top, bg=bbg, fg=fg, text='更新曲奇',command=Update_cookies, font=FONT, width=width_B,height=height_B).grid(
@@ -704,11 +743,29 @@ def del_url():
     url.del_url(index)
     update_URLBOX()
 
+def add_args():
+    global URL_ARGS, UA_Input, use_Cookies_Input,FUNC_Input,DATA_Input
+    try:
+        data = eval(DATA_Input.get(),{})
+    except:
+        data = {}
+    re = dict(
+        func = FUNC_Input.get(),
+        UA = UA_Input.get(),
+        cookies = use_Cookies_Input.get(),
+        data=data
+        )
+    name = ['no_js','no_java','no_plugins','first_run','head','no_img','new']
+    for i in range(len(name)):
+        re[name[i]] = bool(URL_ARGS[i].get())
+    return re
+
 def add_url():
     global URL_Input,url
+    args = add_args()
     new_url = URL_Input.get()
     if new_url == '':return
-    url.add_url(new_url)
+    url.add_url(new_url,**args)
     update_URLBOX()
 
 def update_URLBOX():