# main.py — SSearch: a small interactive CLI that queries Bing and Baidu,
# lists de-duplicated (title, url) results page by page, and can open them.
  1. from abc import ABCMeta, abstractmethod
  2. import requests
  3. from bs4 import BeautifulSoup
  4. import time
  5. import webbrowser
  6. class Search(metaclass=ABCMeta):
  7. def __init__(self):
  8. self.url = ""
  9. self.args = ""
  10. self.bd_session = requests.Session()
  11. self.report = None
  12. self.bs4: BeautifulSoup = None
  13. self.word_list = []
  14. self.url_dict = {}
  15. self.page_num = 0
  16. self.referer = ""
  17. self.headers = {
  18. 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
  19. 'accept-language': 'zh-CN,zh;q=0.9',
  20. 'cache-control': 'max-age=0',
  21. 'sec-fetch-dest': 'document',
  22. 'sec-fetch-mode': 'navigate',
  23. 'sec-fetch-site': 'none',
  24. 'sec-fetch-user': '?1',
  25. 'connection': 'close',
  26. 'upgrade-insecure-requests': '1',
  27. 'accept-encoding': 'gzip, deflate',
  28. "content-type": "application/x-www-form-urlencoded",
  29. "Upgrade-Insecure-Requests": "1",
  30. 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE',
  31. }
  32. @abstractmethod
  33. def get_report(self, args_list, start):
  34. pass
  35. def bs_paser(self) -> None:
  36. assert self.report, "Don't get report"
  37. self.bs4 = BeautifulSoup(self.report, 'html.parser')
  38. @abstractmethod
  39. def find_word(self):
  40. pass
  41. @abstractmethod
  42. def __iter__(self):
  43. pass
  44. @abstractmethod
  45. def __next__(self):
  46. pass
  47. def output_word(self):
  48. return self.word_list
  49. def return_page(self):
  50. return self.page_num
  51. class BingWeb(Search):
  52. def __init__(self):
  53. super().__init__()
  54. self.url = "https://cn.bing.com"
  55. self.headers["Origin"] = "https://cn.bing.com"
  56. self.headers['host'] = 'cn.bing.com'
  57. def get_report(self, args_list=None, start=True):
  58. if args_list:
  59. self.args = "?" + "q=" + args_list
  60. if start:
  61. self.page_num = 0
  62. if self.referer:
  63. self.headers["referer"] = self.referer
  64. self.referer = self.url + self.args
  65. self.report = self.bd_session.get(self.referer, headers=self.headers).text
  66. self.bs_paser()
  67. return self
  68. def find_word(self) -> None:
  69. self.word_list = []
  70. # bing 特色搜索
  71. word = self.bs4.find_all("li", class_="b_ans") # bing 词典(dict_oa), bing 视频(vsa)
  72. for w in word:
  73. dict_oa = w.find("div", class_="dict_oa")
  74. vsa = w.find("div", class_="vsa") # bing 视频
  75. try: # 错误捕捉
  76. if dict_oa: # 找到了dict_oa,是词典模式
  77. self.append_word_list("[bing词典]" + dict_oa.div.div.h2.a.text,
  78. self.url + dict_oa.div.div.h2.a.get("href"))
  79. elif vsa: # 视频模式
  80. self.append_word_list("[bing视频]" + vsa.h2.a.text,
  81. self.url + vsa.h2.a.get("href"))
  82. pass
  83. except AttributeError:
  84. pass
  85. word = self.bs4.find_all("li", class_="b_ans b_mop b_imgans b_imgsmall") # bing 图片
  86. for w in word:
  87. irphead = w.find("div", class_="irphead")
  88. try: # 错误捕捉
  89. if irphead: # 找到了dict_oa,是词典模式
  90. self.append_word_list("[bing图片]" + irphead.h2.a.text,
  91. self.url + irphead.h2.a.get("href"))
  92. except AttributeError:
  93. pass
  94. word = self.bs4.find_all("li", class_="b_algo") # b_algo是普通词条或者官网(通过b_title鉴别)
  95. for w in word:
  96. title = w.find("div", class_="b_title")
  97. try: # 错误捕捉
  98. if title: # 找到了title(官网模式)
  99. self.append_word_list(title.h2.a.text, title.h2.a.get("href"))
  100. else: # 普通词条模式
  101. self.append_word_list(w.h2.a.text, w.h2.a.get("href"))
  102. except AttributeError:
  103. pass
  104. def append_word_list(self, title, url): # 过滤重复并且压入url_list
  105. if not self.url_dict.get(url, None):
  106. self.url_dict[url] = title
  107. self.word_list.append((title, url))
  108. def __iter__(self):
  109. self.page_num = -1
  110. return self
  111. def __next__(self) -> bool:
  112. if self.page_num == -1: # 默认的第一次get
  113. self.page_num += 1
  114. return True
  115. self.page_num += 1
  116. title = self.bs4.find("a", title=f"下一页")
  117. if title:
  118. self.args = title.get("href")
  119. self.report = self.get_report(None, False)
  120. else:
  121. raise StopIteration
  122. return True
  123. class BaiduWeb(Search):
  124. def __init__(self):
  125. super().__init__()
  126. self.url = "https://www.baidu.com"
  127. self.headers["Origin"] = "https://www.baidu.com"
  128. self.headers['host'] = 'www.baidu.com'
  129. def get_report(self, args_list=None, start=True):
  130. if args_list:
  131. self.args = "/s?" + "wd=" + args_list
  132. if start:
  133. self.page_num = 0
  134. if self.referer:
  135. self.headers["referer"] = self.referer
  136. self.referer = self.url + self.args
  137. self.report = self.bd_session.get(self.referer, headers=self.headers).text
  138. self.bs_paser()
  139. return self
  140. def find_word(self) -> None:
  141. self.word_list = []
  142. # 百度特色搜索
  143. word = self.bs4.find_all("div", class_="result-op c-container xpath-log", tpl="bk_polysemy") # 百度百科
  144. for w in word:
  145. try: # 错误捕捉
  146. self.append_word_list("[百度百科]" + str(w.h3.a.text).replace("\n", ""), w.h3.a.get("href"))
  147. except AttributeError:
  148. pass
  149. word = self.bs4.find_all("div", class_="result c-container")
  150. for w in word:
  151. try: # 错误捕捉
  152. self.append_word_list(w.h3.a.text, w.h3.a.get("href"))
  153. except AttributeError:
  154. pass
  155. word = self.bs4.find_all("div", class_="result-op c-container") # 特殊词条
  156. for w in word:
  157. # c-result-content
  158. try: # 错误捕捉
  159. title = w.find("div", class_="c-result-content").find("section") # 特殊词条
  160. self.append_word_list(title.a.h3.span.text, title.a.get("href"))
  161. except AttributeError:
  162. pass
  163. def append_word_list(self, title, url): # 过滤重复并且压入url_list
  164. # try:
  165. # new_url = requests.get(url, headers=self.headers, timeout=5)
  166. # print(new_url.status_code) # 打印响应的状态码
  167. # url = new_url.url
  168. # print(url)
  169. # except:
  170. # pass
  171. if not self.url_dict.get(url, None):
  172. self.url_dict[url] = title
  173. self.word_list.append((title, url))
  174. def __iter__(self):
  175. self.page_num = -1
  176. return self
  177. def __next__(self) -> bool:
  178. if self.page_num == -1: # 默认的第一次get
  179. self.page_num += 1
  180. return True
  181. self.page_num += 1
  182. page = self.bs4.find("div", id="page")
  183. if not page:
  184. raise StopIteration
  185. next_page_list = self.bs4.find_all("a", class_=f"n")
  186. if next_page_list:
  187. next_page = next_page_list[-1]
  188. if not str(next_page.text).startswith("下一页"):
  189. raise StopIteration
  190. self.args = next_page.get("href")
  191. self.report = self.get_report(None, False)
  192. else:
  193. raise StopIteration
  194. return True
  195. class Seacher: # 搜索者
  196. def __init__(self, word: str):
  197. self.web = {"bing": BingWeb(), "baidu": BaiduWeb()}
  198. self.word = word
  199. self.first = True
  200. self.old_return_str = ""
  201. self.web_name_dict = {} # 同名网站处理
  202. self.url_list = []
  203. def find(self):
  204. for web_name in self.web:
  205. web = self.web[web_name]
  206. web.get_report(self.word).__iter__() # 做好迭代的准备
  207. return self
  208. def __iter__(self):
  209. self.first = True
  210. return self
  211. def __next__(self):
  212. if not self.first:
  213. time.sleep(1)
  214. # 使用了menu之后不需要is_next了
  215. # if not self.is_next():
  216. # raise StopIteration
  217. else:
  218. self.first = False
  219. return_str = ""
  220. for web_name in self.web:
  221. web = self.web[web_name]
  222. try:
  223. web.__next__()
  224. except StopIteration:
  225. pass
  226. else:
  227. web.find_word()
  228. get: list = web.output_word()
  229. return_str += "\n" + "* " * 20 + f"\n{web.return_page()}: [{web_name}] for {self.word} >>>\n"
  230. for i in get:
  231. if self.web_name_dict.get(i[0], None):
  232. return_str += f"[{len(self.url_list)}][曾经出现过 {self.web_name_dict[i[0]]}] {i[0]}\n{' ' * 8}-> {i[1]}\n"
  233. self.url_list.append(i[1])
  234. else:
  235. return_str += f"[{len(self.url_list)}]{i[0]}\n{' ' * 8}-> {i[1]}\n"
  236. self.web_name_dict[i[0]] = f"{web_name}, page: {web.return_page()}, [{len(self.url_list)}]"
  237. self.url_list.append(i[1])
  238. return_str += "* " * 20 + "\n"
  239. self.old_return_str = return_str
  240. return return_str
  241. def out_again(self): # 再输出一次
  242. return self.old_return_str
  243. def open_url(self, num) -> None: # 再输出一次
  244. try:
  245. url = self.url_list[num]
  246. except IndexError: # 太大了
  247. return None
  248. webbrowser.open_new_tab(url)
  249. time.sleep(3)
  250. @staticmethod
  251. def is_next():
  252. return input("next? [Y/n]") != "n"
  253. class Menu:
  254. def __init__(self):
  255. self.searcher_dict = {}
  256. self.searcher_dict_old = {}
  257. print("Welcome To SSearch!")
  258. def menu(self) -> None:
  259. while True:
  260. try:
  261. if not self.__menu():
  262. break
  263. except KeyboardInterrupt:
  264. print("\n", end="")
  265. except BaseException as e:
  266. print(f"There are some Error:\n{e}\n")
  267. def __menu(self): # 注: self是有作用的(exec)
  268. try:
  269. command = input(f'[\033[4mSSearch\033[0m] > ') # 输入一条指令
  270. except KeyboardInterrupt:
  271. print("\nPlease Enter 'quit' or 'q' to quit")
  272. return True
  273. if command == "q" or command == "quit":
  274. print("SSearch: Bye Bye!")
  275. return False # 结束
  276. try:
  277. exec(f"self.func_{command}()")
  278. except AttributeError:
  279. print("Not Support Command. [help]")
  280. return True
  281. def func_make(self):
  282. word = input("输入关键词:")
  283. name = input(f"输入名字[默认={word}]:")
  284. if not name:
  285. name = word
  286. self.searcher_dict[name] = Seacher(word) # 制造一个搜索器
  287. self.searcher_dict[name].find().__iter__() # 迭代准备
  288. self.func_next(name, True)
  289. def func_again(self, name=None):
  290. if not name:
  291. name = input(f"输入名字:")
  292. seacher_iter = self.searcher_dict.get(name, None)
  293. if not seacher_iter:
  294. print("没有找到对应搜索器或搜索器已经搜索结束")
  295. else:
  296. print(seacher_iter.out_again())
  297. def func_open(self):
  298. name = input(f"输入名字:")
  299. try:
  300. num = int(input("输入代号:"))
  301. except ValueError:
  302. print("请输入数字代号")
  303. return
  304. seacher_iter = self.searcher_dict.get(name, None)
  305. seacher_iter_old = self.searcher_dict_old.get(name, None)
  306. if seacher_iter:
  307. seacher_iter.open_url(num)
  308. elif seacher_iter_old:
  309. seacher_iter_old.open_url(num)
  310. else:
  311. print("没有找到对应搜索器或搜索器已经搜索结束")
  312. def func_next(self, name=None, first=False):
  313. if not name:
  314. name = input(f"输入名字:")
  315. if not first:
  316. self.func_again(name)
  317. seacher_iter = self.searcher_dict.get(name, None)
  318. if not seacher_iter:
  319. print("没有找到对应搜索器或搜索器已经搜索结束")
  320. else:
  321. try:
  322. if first: # make的时候需要输出
  323. out = seacher_iter.__next__()
  324. print(out)
  325. seacher_iter.__next__() # 储备输出
  326. except StopIteration:
  327. self.func_again(name) # 输出最后的结果
  328. self.searcher_dict_old[name] = self.searcher_dict[name]
  329. del self.searcher_dict[name] # 删除输出
  330. print(f"{name}: [搜索结束]")
  331. except AttributeError as e:
  332. print(f"There are some Error:\n{e}\n")
  333. if __name__ == "__main__":
  334. menu = Menu()
  335. menu.menu()