word.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170
"""
File: word.py
Look up the Chinese translation of a word by querying an online dictionary.
Uses requests to issue the HTTP request.
Uses bs4 to parse the returned web page.
"""
  7. import logging
  8. import requests
  9. from bs4 import BeautifulSoup
  10. from typing import Optional, Dict
  11. from core.aliyun import tls
  12. class WordDict:
  13. __data_set_search = "english-chinese-simplified"
  14. __url = f"https://dictionary.cambridge.org/zhs/%E6%90%9C%E7%B4%A2/direct/?datasetsearch={__data_set_search}"
  15. __logger = logging.getLogger("dict")
  16. __logger.propagate = False
  17. def __init__(self, user_agent: Optional[str] = None, proxies: Optional[dict] = None):
  18. if user_agent is None:
  19. user_agent = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) "
  20. "Chrome/17.0.963.56 Safari/535.11 ")
  21. if proxies is None:
  22. proxies = {'http': "http://localhost:8889", 'https': "http://localhost:8889"} # 不走系统代理
  23. self.headers = {"User-Agent": user_agent} # 配置请求头
  24. self.proxies = proxies
  25. def set_headers(self, headers: dict):
  26. self.headers.update(headers)
  27. def set_proxies(self, proxies: dict):
  28. self.proxies.update(proxies)
  29. def get_requests(self, q: str) -> "Response":
  30. response = requests.get(self.__url,
  31. params={"q": q},
  32. headers=self.headers,
  33. proxies=self.proxies)
  34. self.__logger.info(f"Get requests params: {q} url: {response.url} status: {response.status_code}")
  35. return Response(response)
  36. class Response:
  37. __logger = logging.getLogger("dict.response")
  38. __logger.propagate = False
  39. def __init__(self, response: requests.Response):
  40. self._res = response
  41. self._soup = None
  42. if self._res.status_code != 200:
  43. self.__logger.debug(f"Response bad status : {self._res.status_code}")
  44. return
  45. self._soup = BeautifulSoup(self._res.text, "html.parser")
  46. self.di_body = self._soup.find(name="div", attrs={"class": "di-body"})
  47. if self.di_body is None:
  48. self._soup = None
  49. self.__logger.debug(f"Response bad syntax url: {self._res.url}")
  50. return
  51. self.entry = [(i, 1) for i in self.di_body.findAll(name="div", attrs={"class": "pr entry-body__el"})]
  52. self.entry += [(i, 2) for i in self.di_body.findAll(name="div", attrs={"class": "pv-block"})]
  53. if len(self.entry) == 0:
  54. self._soup = None
  55. self.__logger.debug(f"Response bad syntax url: {self._res.url}")
  56. return
  57. self.res: Dict[str: Word] = {}
  58. for i, f in self.entry:
  59. name = i.find(name="div", attrs={"class": "di-title"})
  60. if name is None:
  61. continue
  62. if f == 1:
  63. name_string = str(name.span.span.text)
  64. part = i.find(name="div", attrs={"class": "posgram dpos-g hdib lmr-5"})
  65. if part is None:
  66. part_string = "unknown"
  67. else:
  68. part_string = str(part.span.text)
  69. else:
  70. name_string = str(name.h2.b.text)
  71. part = i.find(name="div", attrs={"class": "pos dpos"})
  72. if part is None:
  73. part_string = "unknown"
  74. else:
  75. part_string = str(part.span.text)
  76. word = self.res.get(name_string)
  77. if word is None:
  78. tls_res = tls.start(name_string)
  79. if not tls_res.success:
  80. continue
  81. word = Word(name_string, tls_res.mp3)
  82. self.res[name_string] = word
  83. h = i.find(name="div", attrs={"class": "ddef_h"})
  84. if h is None:
  85. continue
  86. english = str(h.div.text)
  87. b = i.find(name="div", attrs={"class": "def-body ddef_b"})
  88. if b is None:
  89. continue
  90. chinese = str(b.span.text)
  91. comment = Word.Comment(part_string, english, chinese)
  92. eg = b.findAll(name="div", attrs={"class": "examp dexamp"})
  93. for e in eg:
  94. es = e.find(name="span", attrs={"class": "eg deg"})
  95. cs = e.find(name="span", attrs={"class": "trans dtrans dtrans-se hdb break-cj"})
  96. if es is None:
  97. continue
  98. es = str(es.text).replace("##", " ").replace("@@", " ")
  99. if cs is not None:
  100. cs = str(cs.text).replace("##", " ").replace("@@", " ")
  101. else:
  102. cs = ""
  103. comment.add_eg(f"{es}##{cs}")
  104. word.add_comment(comment)
  105. if len(self.res) == 0:
  106. self._soup = None
  107. self.__logger.debug(f"Response bad word: {self._res.url}")
  108. @property
  109. def is_find(self):
  110. return self._soup is not None
  111. class Word:
  112. class Comment:
  113. def __init__(self, part: str, english: str, chinese: str):
  114. self.part = part # 词性
  115. self.english = english
  116. self.chinese = chinese
  117. self.eg = []
  118. def add_eg(self, eg: str):
  119. eg = eg.strip()
  120. if eg == "##" or len(eg) == 0:
  121. return
  122. self.eg.append(eg)
  123. def __str__(self):
  124. return f"{self.part} {self.english} {self.chinese} \neg: {self.eg}"
  125. def __init__(self, name: str, mp3: str, box: int = 1):
  126. self.name = name
  127. self.comment: Dict[str: "Word.Comment"] = {} # 注释
  128. if box < 1 or box > 5:
  129. box = 1
  130. self.box = box
  131. self.mp3 = mp3
  132. def add_comment(self, c: Comment):
  133. if self.comment.get(c.english) is None:
  134. self.comment[c.english] = c
  135. def set_box(self, box: int):
  136. if 0 < box < 6:
  137. self.box = box
  138. def __str__(self):
  139. ret = f"{self.name}:\n"
  140. for i in self.comment:
  141. ret += f'note: {self.comment[i]};\n'
  142. return ret