Prechádzať zdrojové kódy

初步完成:页面下载器

Huan 5 rokov pred
rodič
commit
b5589a0970
2 zmenil súbory, kde vykonal 402 pridanie a 17 odobranie
  1. 149 16
      Crawler_controller.py
  2. 253 1
      Web_Crawler.py

+ 149 - 16
Crawler_controller.py

@@ -1,40 +1,62 @@
 from selenium import webdriver
+import threading
+import time
+from os.path import exists
+from os import mkdir
+import hashlib
 
-class url:
+
+class url:#url管理器
     num = 0#url处理器个数
-    def __init__(self,dic=f'',dic_run=f'',add_func=lambda url:True,change_url=lambda url:url):
+    def __init__(self,dic=f'',dic_run=f''):
         url.num += 1
-        if dic == '':dic = f'url[{url.num}].cot_url'
-        if dic_run == '':dic = f'url_run[{url.num}].cot_url'
+        dic += f'/url[{url.num}].cot_url'
+        dic_run += f'/url_run[{url.num}].cot_url'
         self.dir = dic
         self.dir_run = dic_run
         self.file = open(dic,'a')#写入url_history的文件
         self.file_run = open(dic_run,'a')#写入已读url文件
         self.url_list = []#待读url
-        self.add_func = add_func#url添加过滤方法
-        self.change_url = change_url#url更正方法
         self.url_history = []#url历史
+        self.filter = {}#过滤函数
+
+    def filter_func(self,url):#url过滤系统
+        for i in self.filter:
+            if not self.filter[i](url): return False
+        return True
+
+    def Add_func(self,func,name):#添加过滤函数
+        self.filter[name] = func
+
+    def Del_func(self,index):#删除过滤函数
+        del self.filter[list(self.filter.keys())[index]]
+
+    def return_func(self):
+        return list(self.filter.keys())
 
-    def add_url(self,url):
-        url = self.change_url(url)#url更正,比如http替换https
-        if url not in self.url_history and self.add_func(url):#1.url不存在历史,2.url满足筛选条件
-            self.url_list.append(url)
-            self.url_history.append(url)
-            self.__out_url(url)
+    def add_url(self,url):#添加url
+        if url not in self.url_history and self.filter_func(url):#1.url不存在历史,2.url满足筛选条件
+            self.url_list.append(url)#添加到待取得url
+            self.url_history.append(url)#添加到历史url
+            self.__out_url(url)#输出历史url
             return True#写入成功
         return False#写入失败
 
-    def get_url(self):
+    def del_url(self,index):#删除url
+        self.__out_url_run(f'DELETE {self.url_list[index]}')
+        del self.url_list[index]
+
+    def get_url(self):#取得url
         url = self.url_list[0]
         self.__out_url_run(url)
         del self.url_list[0]
         return url
 
-    def __out_url(self,url):#输出url
+    def __out_url(self,url):#输出url历史
         self.file.write(f'{url}\n')
         self.file.flush()
 
-    def __out_url_run(self,url):#输出url
+    def __out_url_run(self,url):#输出已经运行的url
         self.file_run.write(f'{url}\n')
         self.file_run.flush()
 
@@ -42,4 +64,115 @@ class url:
         return self.url_list.copy()
 
     def return_url_history(self):
-        return self.url_history.copy()
+        return self.url_history.copy()
+
+class Page_Downloader:
+    """Load one URL in Chrome through Selenium and snapshot its page source.
+
+    A downloader takes its URLs from a ``url`` manager, runs a list of
+    user-registered "wait strategies" against the open browser, records the
+    page source after each successful strategy, and can write the collected
+    snapshots to a directory named after the MD5 of the URL.
+    """
+    num = 0  # number of Page_Downloader instances created so far
+
+    def __init__(self,url:url,dic=''):
+        """Create a downloader.
+
+        :param url: URL manager; ``get_url()`` is called on it to obtain work.
+        :param dic: directory under which page snapshots are saved.
+        """
+        self.url = url
+        self.dir = dic
+        Page_Downloader.num += 1
+        self.page_source_dict = {}  # snapshot name -> page source
+        self.wait = {}  # wait-strategy name -> wrapped callable
+        self.wait_list = []  # wait-strategy names, in execution order
+        self._wait_count = 0  # strategies ever added; keeps generated names unique after deletions
+        self.cookie_Thread = None  # background thread that reports cookies
+        self.break_ = False  # True while the cookie-reporting thread should keep polling
+
+    def Add_func(self,func,name):
+        """Register a wait strategy.
+
+        ``func`` is called with the browser and must return ``(save, name)``;
+        a malformed result or an empty snapshot name is turned into
+        ``(False, '')`` so the caller can always unpack two values.
+        """
+        # A running counter (not len(self.wait)) so that a name freed by
+        # Del_func can never be generated again and overwrite a live entry.
+        name = f'[{self._wait_count}]{name}'
+        self._wait_count += 1
+        def f(*args,**kwargs):
+            get = func(*args,**kwargs)
+            print(get)
+            try:
+                if get[1] == '':raise ValueError('empty snapshot name')
+                return get  # (save, name)
+            except Exception:
+                return False,''
+        self.wait_list.append(name)
+        self.wait[name] = f
+
+    def Del_func(self,index):
+        """Remove the wait strategy at position ``index`` (registration order).
+
+        Both the ordered name list and the lookup dict are updated; removing
+        only the dict entry would make Logical_operation fail with KeyError.
+        """
+        name = self.wait_list.pop(index)
+        del self.wait[name]
+
+    def return_func(self):
+        """Return the names of the registered wait strategies."""
+        return list(self.wait.keys())
+
+    def __seeting(self,*args):
+        """Build ChromeOptions from the given command-line arguments (empty strings are skipped)."""
+        options = webdriver.ChromeOptions()
+        options.add_argument('disable-infobars')  # do not show the info bar
+        for i in args:
+            if i == '':continue
+            options.add_argument(i)
+        return options
+
+    def strat_urlGet(self,*args):
+        """Take the next URL from the manager, open it in a new Chrome instance and return the browser.
+
+        ``args`` are extra Chrome command-line arguments.
+        """
+        self.break_ = False  # ask a cookie thread left over from a previous run to stop
+        self.page_source_dict = {}
+        self.nowurl = self.url.get_url()
+        url = self.nowurl
+        # `options=` replaces the deprecated `chrome_options=` keyword.
+        self.browser = webdriver.Chrome(options=self.__seeting(*args))
+        self.browser.get(url)
+        return self.browser
+
+    def Logical_operation(self,func_cookie=lambda x:None,func_page=lambda x:None):
+        """Run every wait strategy in order and snapshot the page around them.
+
+        :param func_cookie: called about once per second, from a background
+            thread, with the browser's current cookies.
+        :param func_page: called with the list of snapshot names each time
+            that list may have changed.
+        """
+        browser = self.browser
+        self.page_source_dict['FIRST_PAGE'] = browser.page_source  # source before any strategy ran
+        func_page(list(self.page_source_dict.keys()))
+        self.break_ = True
+        def update_cookie():
+            while self.break_:
+                try:
+                    func_cookie(self.browser.get_cookies())  # report cookies to the GUI
+                except Exception:
+                    pass  # best effort: the browser or the GUI may be gone
+                # Sleep outside the try so a failing call cannot turn this into a busy loop.
+                time.sleep(1)
+        # Daemon thread: the poller has no natural end and must not keep the process alive.
+        self.cookie_Thread = threading.Thread(target=update_cookie,daemon=True)
+        self.cookie_Thread.start()
+        for i in self.wait_list:
+            save,name = self.wait[i](browser)
+            print(save)
+            if save:
+                self.page_source_dict[name] = browser.page_source
+            func_page(list(self.page_source_dict.keys()))
+        self.page_source_dict['LAST_PAGE'] = browser.page_source  # source after all strategies ran
+        func_page(list(self.page_source_dict.keys()))
+
+    def save_Page(self):
+        """Write the current URL and every snapshot into a fresh directory.
+
+        The directory is ``<dir>/Page_<md5 of url>``; if that exists, the
+        first free ``...[0]``, ``...[1]``, ... variant is used instead.
+        """
+        dic = self.dir + f'/Page_{hashlib.md5(self.nowurl.encode("utf8")).hexdigest()}'
+        new_dir = dic
+        a = 0
+        # Test the candidate itself; testing `dic` would never terminate once it exists.
+        while exists(new_dir):
+            new_dir = dic + f'[{a}]'
+            a += 1
+        mkdir(new_dir)
+        # Explicit UTF-8 so non-ASCII page content does not depend on the locale encoding.
+        with open(f'{new_dir}/url', 'w', encoding='utf-8') as f:
+            f.write(self.nowurl)
+        for i in self.page_source_dict:
+            with open(f'{new_dir}/{i}.html','w', encoding='utf-8') as f:
+                f.write(str(self.page_source_dict[i]))
+        return None
+
+    def Del_cookies(self,name):
+        """Delete the cookie called ``name`` from the open browser."""
+        browser = self.browser
+        browser.delete_cookie(name)
+
+    def Tra_cookies(self):
+        """Delete all cookies from the open browser."""
+        browser = self.browser
+        browser.delete_all_cookies()
+
+    def Add_cookies(self,cookies:dict):
+        """Add one cookie (a Selenium cookie dict) to the open browser."""
+        browser = self.browser
+        browser.add_cookie(cookies)
+
+    def update_cookies(self,name,cookies:dict,):
+        """Merge ``cookies`` into the existing cookie called ``name``.
+
+        :raises Exception: if the browser has no cookie with that name.
+        """
+        browser = self.browser
+        cookies_list = browser.get_cookies()
+        for i in cookies_list:
+            if i.get('name',None) == name:
+                browser.delete_cookie(name)  # drop the old cookie before re-adding it
+                i.update(cookies)
+                browser.add_cookie(i)
+                return
+        raise Exception(f'no cookie named {name!r}')

+ 253 - 1
Web_Crawler.py

@@ -1 +1,253 @@
-from selenium import webdriver
+import Crawler_controller
+import os
+import tkinter
+from tkinter.filedialog import askdirectory
+import re
+import threading
+import time
+
+def Main():
+    """Build the crawler window, ask for a project directory, and run the Tk main loop.
+
+    Widgets and the ``url`` / ``loader`` objects are published as module
+    globals because the button callbacks in this module read them.
+    """
+    global top,Git,PATH,bg,bbg,fg,cookies_list
+    PATH = os.getcwd()
+    top = tkinter.Tk()
+    cookies_list = []
+    bg = '#FFFAFA'  # main background color
+    bbg = '#FFFAFA'  # button color
+    fg = '#000000'  # text color
+    top["bg"] = bg
+    FONT = ('黑体', 11)  # font used by all widgets
+    top.title('CoTan仓库管理器')
+    top.resizable(width=False, height=False)
+    top.geometry('+10+10')  # window position
+
+    width_B = 13  # standard widget width
+    height_B = 2
+    a_y = 0
+    a_x = 0
+
+    tkinter.Button(top, bg=bbg, fg=fg, text='添加url',command=add_url , font=FONT, width=width_B,
+                   height=height_B).grid(column=a_x, row=a_y, sticky=tkinter.E + tkinter.W)
+    tkinter.Button(top, bg=bbg, fg=fg, text='删除url',command=del_url , font=FONT, width=width_B,
+                   height=height_B).grid(column=a_x+1, row=a_y, sticky=tkinter.E + tkinter.W)
+    tkinter.Button(top, bg=bbg, fg=fg, text='应用过滤机制', font=FONT, width=width_B,
+                   height=height_B).grid(column=a_x+2, row=a_y, sticky=tkinter.E + tkinter.W)
+
+    global URL_BOX,URL_Input,Func_BOX
+    a_y += 1
+    tkinter.Label(top, text='添加url:', bg=bg, fg=fg, font=FONT, width=width_B, height=height_B).grid(column=a_x,row=a_y)
+    URL_Input = tkinter.Entry(top, width=width_B * 2)
+    URL_Input.grid(column=a_x + 1, row=a_y, columnspan=2, sticky=tkinter.E + tkinter.W)
+
+    a_y += 1
+    URL_BOX = tkinter.Listbox(top, width=width_B * 3, height=height_B * 3)
+    URL_BOX.grid(column=a_x, row=a_y, columnspan=3, rowspan=3, sticky=tkinter.E + tkinter.W + tkinter.S + tkinter.N)
+
+    a_y += 3
+    tkinter.Button(top, bg=bbg, fg=fg, text='HTTPS过滤',command=add_filter_func_HTTPS, font=FONT, width=width_B,height=height_B).grid(
+        column=a_x, row=a_y, sticky=tkinter.E + tkinter.W)
+    tkinter.Button(top, bg=bbg, fg=fg, text='WWW过滤',command=add_filter_func_WWW, font=FONT, width=width_B,height=height_B).grid(
+        column=a_x+1, row=a_y, sticky=tkinter.E + tkinter.W)
+    tkinter.Button(top, bg=bbg, fg=fg, text='删除过滤',command=del_func, font=FONT, width=width_B,height=height_B).grid(
+        column=a_x+2, row=a_y, sticky=tkinter.E + tkinter.W)
+
+    a_y += 1
+    # NOTE(review): the custom-filter button is wired to the HTTPS filter - looks like a placeholder.
+    tkinter.Button(top, bg=bbg, fg=fg, text='自定义过滤',command=add_filter_func_HTTPS, font=FONT, width=width_B,height=height_B).grid(
+        column=a_x, row=a_y,columnspan=2, sticky=tkinter.E + tkinter.W)
+    tkinter.Button(top, bg=bbg, fg=fg, text='清空过滤', font=FONT, width=width_B,height=height_B).grid(
+        column=a_x+2, row=a_y, sticky=tkinter.E + tkinter.W)
+
+    global Func_BOX,cookies_fixed
+    a_y += 1
+    Func_BOX = tkinter.Listbox(top, width=width_B * 3, height=height_B * 2)
+    Func_BOX.grid(column=a_x, row=a_y, columnspan=3, rowspan=2, sticky=tkinter.E + tkinter.W + tkinter.S + tkinter.N)
+
+    global wait_Func_BOX,Wait_Input,cookies_BOX
+
+    a_y += 2
+    tkinter.Button(top, bg=bbg, fg=fg, text='执行网页下载',command=startDownloader, font=FONT, width=width_B,height=height_B).grid(
+        column=a_x, row=a_y, sticky=tkinter.E + tkinter.W)
+    tkinter.Button(top, bg=bbg, fg=fg, text='显式等待',command=add_time_wait, font=FONT, width=width_B,height=height_B).grid(
+        column=a_x+1, row=a_y, sticky=tkinter.E + tkinter.W)
+    Wait_Input = tkinter.Entry(top, width=width_B)
+    Wait_Input.grid(column=a_x + 2, row=a_y, sticky=tkinter.E + tkinter.W)
+
+    a_y += 3
+    # NOTE(review): the custom-wait button is wired to the HTTPS filter and the
+    # clear-wait button to del_func (which deletes a URL filter) - both look like placeholders.
+    tkinter.Button(top, bg=bbg, fg=fg, text='自定义等待策略',command=add_filter_func_HTTPS, font=FONT, width=width_B,height=height_B).grid(
+        column=a_x, row=a_y, sticky=tkinter.E + tkinter.W)
+    tkinter.Button(top, bg=bbg, fg=fg, text='删除等待策略',command=del_waitfunc, font=FONT, width=width_B,height=height_B).grid(
+        column=a_x+1, row=a_y, sticky=tkinter.E + tkinter.W)
+    tkinter.Button(top, bg=bbg, fg=fg, text='清空等待策略',command=del_func, font=FONT, width=width_B,height=height_B).grid(
+        column=a_x+2, row=a_y, sticky=tkinter.E + tkinter.W)
+
+    a_y += 1
+    wait_Func_BOX = tkinter.Listbox(top, width=width_B * 3, height=height_B * 2)
+    wait_Func_BOX.grid(column=a_x, row=a_y, columnspan=3, rowspan=2, sticky=tkinter.E + tkinter.W + tkinter.S + tkinter.N)
+
+    a_y += 2
+    cookies_fixed = tkinter.Variable()
+    tkinter.Label(top, text='【曲奇监视】', bg=bg, fg=fg, font=FONT, width=width_B, height=height_B).grid(
+        column=a_x+1,row=a_y,sticky=tkinter.E + tkinter.W + tkinter.W + tkinter.S + tkinter.N)  # section caption
+    tkinter.Checkbutton(top, bg=bg, fg=fg, activebackground=bg, activeforeground=fg, selectcolor=bg, text='固定曲奇',
+                        variable=cookies_fixed).grid(column=a_x + 2, row=a_y, sticky=tkinter.W)
+    cookies_fixed.set('0')
+
+    a_y += 1
+    cookies_BOX = tkinter.Listbox(top, width=width_B * 3, height=height_B * 2)
+    cookies_BOX.grid(column=a_x, row=a_y, columnspan=3, rowspan=2, sticky=tkinter.E + tkinter.W + tkinter.S + tkinter.N)
+
+    a_y += 2
+    tkinter.Button(top, bg=bbg, fg=fg, text='清空曲奇',command=Tra_cookies, font=FONT, width=width_B,height=height_B).grid(
+        column=a_x, row=a_y, sticky=tkinter.E + tkinter.W)
+    tkinter.Button(top, bg=bbg, fg=fg, text='更新曲奇',command=Update_cookies, font=FONT, width=width_B,height=height_B).grid(
+        column=a_x+1, row=a_y, sticky=tkinter.E + tkinter.W)
+    tkinter.Button(top, bg=bbg, fg=fg, text='删除曲奇',command=Del_cookies, font=FONT, width=width_B,height=height_B).grid(
+        column=a_x+2, row=a_y, sticky=tkinter.E + tkinter.W)
+
+    global cookies_Input,PAGE_BOX
+    a_y += 1
+    cookies_Input = tkinter.Entry(top, width=width_B * 2)
+    cookies_Input.grid(column=a_x, row=a_y, columnspan=2, sticky=tkinter.E + tkinter.W)
+    tkinter.Button(top, bg=bbg, fg=fg, text='添加曲奇',command=Add_cookies, font=FONT, width=width_B,height=height_B).grid(
+        column=a_x+2, row=a_y, sticky=tkinter.E + tkinter.W)
+
+    a_y += 1
+    # cookies_fixed is deliberately NOT re-created here: rebinding the global to a
+    # new Variable would disconnect it from the Checkbutton created above.
+    tkinter.Label(top, text='【已存储页面】', bg=bg, fg=fg, font=FONT, width=width_B, height=height_B).grid(
+        column=a_x,row=a_y,columnspan=3,sticky=tkinter.E + tkinter.W + tkinter.W + tkinter.S + tkinter.N)
+
+    a_y += 1
+    PAGE_BOX = tkinter.Listbox(top, width=width_B * 3, height=height_B * 2)
+    PAGE_BOX.grid(column=a_x, row=a_y, columnspan=3, rowspan=2, sticky=tkinter.E + tkinter.W + tkinter.S + tkinter.N)
+
+    top.update()  # update once before opening the dialog, otherwise the window hangs
+    global url,loader
+    save_dir = askdirectory(title='选择项目位置')  # project directory
+    url = Crawler_controller.url(save_dir,save_dir)
+    loader = Crawler_controller.Page_Downloader(url,save_dir)
+    top.mainloop()
+
+def PAGE_BOX_Update(PAGE_list):
+    """Show exactly the snapshot names in ``PAGE_list`` in the saved-pages list box."""
+    first_row = 0
+    PAGE_BOX.delete(first_row, tkinter.END)  # drop whatever was listed before
+    PAGE_BOX.insert(first_row, *PAGE_list)
+
+def Update_cookies():
+    """Merge the dict typed in the cookie entry into the cookie selected in the list box.
+
+    Does nothing (returns False) unless the "fixed" checkbox state is set,
+    i.e. ``cookies_fixed`` is not ``'0'``.
+    """
+    global cookies_BOX,cookies_list,cookies_Input
+    # Check permission first so refused requests never evaluate the input.
+    if cookies_fixed.get() == '0':return False
+    # SECURITY NOTE(review): eval() executes whatever was typed into the entry.
+    # The text comes from the local user, but ast.literal_eval would be safer.
+    cookies = eval(cookies_Input.get(),{})
+    try:
+        name = cookies_list[cookies_BOX.curselection()[0]].get('name')
+        loader.update_cookies(name,cookies)
+        cookies_fixed.set('0')
+    except Exception as error:
+        # Best effort (nothing selected, cookie gone, browser closed): report instead of hiding it.
+        print(f'Update_cookies failed: {error!r}')
+
+def Add_cookies():
+    """Add the cookie dict typed in the cookie entry to the browser.
+
+    Does nothing (returns False) unless ``cookies_fixed`` is not ``'0'``.
+    Errors from the browser propagate to the caller, as before.
+    """
+    global cookies_BOX,cookies_list,cookies_Input
+    # Check permission first so refused requests never evaluate the input.
+    if cookies_fixed.get() == '0':return False
+    # SECURITY NOTE(review): eval() executes whatever was typed into the entry.
+    # The text comes from the local user, but ast.literal_eval would be safer.
+    cookies = eval(cookies_Input.get(),{})
+    loader.Add_cookies(cookies)
+    cookies_fixed.set('0')
+
+def Tra_cookies():
+    """Delete every cookie in the browser; only allowed while ``cookies_fixed`` is not ``'0'``."""
+    global cookies_BOX,cookies_list
+    if cookies_fixed.get() == '0':return False
+    try:
+        loader.Tra_cookies()
+        cookies_fixed.set('0')
+    except Exception as error:
+        # Best effort (e.g. no browser started yet): report instead of hiding it.
+        print(f'Tra_cookies failed: {error!r}')
+
+def Del_cookies():
+    """Delete the cookie selected in the list box; only allowed while ``cookies_fixed`` is not ``'0'``."""
+    global cookies_BOX,cookies_list
+    if cookies_fixed.get() == '0':return False
+    try:
+        name = cookies_list[cookies_BOX.curselection()[0]].get('name')
+        print(name)
+        loader.Del_cookies(name)
+        cookies_fixed.set('0')
+    except Exception as error:
+        # Best effort (nothing selected, browser closed): report instead of hiding it.
+        print(f'Del_cookies failed: {error!r}')
+
+def cookies_BOX_Update(cookies):
+    """Refresh the cookie list box from ``cookies`` while ``cookies_fixed`` is ``'0'``."""
+    global cookies_list
+    if cookies_fixed.get() != '0':
+        return  # the list is frozen; keep what is shown
+    cookies_list = cookies  # remembered so selections can be mapped back to cookie dicts
+    cookies_BOX.delete(0,tkinter.END)
+    cookies_BOX.insert(0,*cookies)
+
+def add_time_wait():
+    """Register a wait strategy that sleeps for the number of seconds typed in the wait entry.
+
+    Input that is not a number is ignored.
+    """
+    global url,Wait_Input
+    try:
+        times = float(Wait_Input.get())
+    except ValueError:
+        return  # empty or non-numeric entry: nothing to add
+    def wait_time(*args):
+        time.sleep(times)
+        # Name the snapshot after the delay; formatting `time` here would embed the module's repr.
+        return True,f'After_{times}s'
+    loader.Add_func(wait_time,f'wait {times}s')
+    update_Wait_Input()
+
+def del_waitfunc():
+    """Remove the wait strategy selected in the wait-strategy list box, if any."""
+    global wait_Func_BOX
+    selection = wait_Func_BOX.curselection()
+    if not selection:return  # nothing selected: indexing the empty tuple would raise IndexError
+    loader.Del_func(selection[0])
+    update_Wait_Input()
+
+def update_Wait_Input():
+    """Redraw the wait-strategy list box from the strategies registered on ``loader``."""
+    strategy_names = loader.return_func()
+    wait_Func_BOX.delete(0,tkinter.END)
+    wait_Func_BOX.insert(tkinter.END,*strategy_names)
+
+def startDownloader():
+    """Download the next queued URL on a worker thread: load it, run the wait strategies, save the pages.
+
+    Does nothing when no URL is queued.
+    """
+    # Without a queued URL the worker would die with IndexError inside get_url().
+    if not url.return_url():return
+    def startLoader():
+        global loader
+        loader.strat_urlGet()
+        loader.Logical_operation(cookies_BOX_Update,PAGE_BOX_Update)
+        loader.save_Page()
+    new = threading.Thread(target=startLoader)
+    new.start()
+    # NOTE(review): this may run before the worker has taken its URL off the queue.
+    update_URLBOX()
+
+def add_filter_func_HTTPS():
+    """Register a URL filter that accepts only URLs starting with ``https://``."""
+    https_pattern = re.compile('^https://')
+    def is_https(candidate):
+        # Returns the match object (truthy) or None, as the filter contract expects.
+        return re.match(https_pattern,candidate)
+    url.Add_func(is_https,'HTTPS过滤')
+    update_Func_BOX()
+
+def add_filter_func_WWW():
+    """Register a URL filter that accepts only URLs containing ``www.``."""
+    global url
+    # Raw string: '\.' in a normal literal is an invalid escape sequence and triggers a warning.
+    url.Add_func(lambda url:re.match(re.compile(r'.*www\.'),url),'www过滤')
+    update_Func_BOX()
+
+def del_func():
+    """Remove the URL filter selected in the filter list box, if any."""
+    global Func_BOX
+    selection = Func_BOX.curselection()
+    if not selection:return  # nothing selected: indexing the empty tuple would raise IndexError
+    url.Del_func(selection[0])
+    update_Func_BOX()
+
+def update_Func_BOX():
+    """Redraw the filter list box from the filters registered on ``url``."""
+    filter_names = url.return_func()
+    Func_BOX.delete(0,tkinter.END)
+    Func_BOX.insert(tkinter.END,*filter_names)
+
+def del_url():
+    """Remove the queued URL selected in the URL list box, if any."""
+    global URL_BOX
+    selection = URL_BOX.curselection()
+    if not selection:return  # nothing selected: indexing the empty tuple would raise IndexError
+    url.del_url(selection[0])
+    update_URLBOX()
+
+def add_url():
+    """Queue the URL typed in the URL entry (empty input is ignored) and redraw the URL list."""
+    new_url = URL_Input.get()
+    if new_url != '':
+        url.add_url(new_url)
+        update_URLBOX()
+
+def update_URLBOX():
+    """Redraw the URL list box from the URLs still queued on ``url``."""
+    pending_urls = url.return_url()
+    URL_BOX.delete(0,tkinter.END)
+    URL_BOX.insert(tkinter.END,*pending_urls)
+
+
+# Start the GUI only when this file is run as a script, not when it is imported.
+if __name__ == "__main__":
+    Main()