Pārlūkot izejas kodu

初步完成:url管理器

Huan 5 gadi atpakaļ
vecāks
revīzija
6d83dd75d6
3 mainītis faili ar 46 papildinājumiem un 0 dzēšanām
  1. 45 0
      Crawler_controller.py
  2. 0 0
      Information_storage.py
  3. 1 0
      Web_Crawler.py

+ 45 - 0
Crawler_controller.py

@@ -0,0 +1,45 @@
+from selenium import webdriver
+
+class url:
+    num = 0#url处理器个数
+    def __init__(self,dic=f'',dic_run=f'',add_func=lambda url:True,change_url=lambda url:url):
+        url.num += 1
+        if dic == '':dic = f'url[{url.num}].cot_url'
+        if dic_run == '':dic = f'url_run[{url.num}].cot_url'
+        self.dir = dic
+        self.dir_run = dic_run
+        self.file = open(dic,'a')#写入url_history的文件
+        self.file_run = open(dic_run,'a')#写入已读url文件
+        self.url_list = []#待读url
+        self.add_func = add_func#url添加过滤方法
+        self.change_url = change_url#url更正方法
+        self.url_history = []#url历史
+
+    def add_url(self,url):
+        url = self.change_url(url)#url更正,比如http替换https
+        if url not in self.url_history and self.add_func(url):#1.url不存在历史,2.url满足筛选条件
+            self.url_list.append(url)
+            self.url_history.append(url)
+            self.__out_url(url)
+            return True#写入成功
+        return False#写入失败
+
+    def get_url(self):
+        url = self.url_list[0]
+        self.__out_url_run(url)
+        del self.url_list[0]
+        return url
+
+    def __out_url(self,url):#输出url
+        self.file.write(f'{url}\n')
+        self.file.flush()
+
+    def __out_url_run(self,url):#输出url
+        self.file_run.write(f'{url}\n')
+        self.file_run.flush()
+
+    def return_url(self):
+        return self.url_list.copy()
+
+    def return_url_history(self):
+        return self.url_history.copy()

+ 0 - 0
Information_storage.py


+ 1 - 0
Web_Crawler.py

@@ -0,0 +1 @@
+from selenium import webdriver