word.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167
  1. """
  2. 文件名: word.py
  3. 通过检索网络词典获得一个词语的中文翻译
  4. 使用requests生成一个网络请求
  5. 使用bs4解析网页
  6. """
  7. import logging
  8. import requests
  9. from bs4 import BeautifulSoup
  10. from typing import Optional, Dict
  11. from core.aliyun import tls
  12. class WordDict:
  13. __data_set_search = "english-chinese-simplified"
  14. __url = f"https://dictionary.cambridge.org/zhs/%E6%90%9C%E7%B4%A2/direct/?datasetsearch={__data_set_search}"
  15. __logger = logging.getLogger("dict")
  16. __logger.propagate = False
  17. def __init__(self, user_agent: Optional[str] = None, proxies: Optional[dict] = None):
  18. if user_agent is None:
  19. user_agent = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) "
  20. "Chrome/17.0.963.56 Safari/535.11 ")
  21. self.headers = {"User-Agent": user_agent} # 配置请求头
  22. self.proxies = {}
  23. def set_headers(self, headers: dict):
  24. self.headers.update(headers)
  25. def set_proxies(self, proxies: dict):
  26. self.proxies.update(proxies)
  27. def get_requests(self, q: str) -> "Response":
  28. response = requests.get(self.__url,
  29. params={"q": q},
  30. headers=self.headers,
  31. proxies=self.proxies)
  32. self.__logger.info(f"Get requests params: {q} url: {response.url} status: {response.status_code}")
  33. return Response(response)
  34. class Response:
  35. __logger = logging.getLogger("dict.response")
  36. __logger.propagate = False
  37. def __init__(self, response: requests.Response):
  38. self._res = response
  39. self._soup = None
  40. if self._res.status_code != 200:
  41. self.__logger.debug(f"Response bad status : {self._res.status_code}")
  42. return
  43. self._soup = BeautifulSoup(self._res.text, "html.parser")
  44. self.di_body = self._soup.find(name="div", attrs={"class": "di-body"})
  45. if self.di_body is None:
  46. self._soup = None
  47. self.__logger.debug(f"Response bad syntax url: {self._res.url}")
  48. return
  49. self.entry = [(i, 1) for i in self.di_body.findAll(name="div", attrs={"class": "pr entry-body__el"})]
  50. self.entry += [(i, 2) for i in self.di_body.findAll(name="div", attrs={"class": "pv-block"})]
  51. if len(self.entry) == 0:
  52. self._soup = None
  53. self.__logger.debug(f"Response bad syntax url: {self._res.url}")
  54. return
  55. self.res: Dict[str: Word] = {}
  56. for i, f in self.entry:
  57. name = i.find(name="div", attrs={"class": "di-title"})
  58. if name is None:
  59. continue
  60. if f == 1:
  61. name_string = str(name.span.span.text)
  62. part = i.find(name="div", attrs={"class": "posgram dpos-g hdib lmr-5"})
  63. if part is None:
  64. part_string = "unknown"
  65. else:
  66. part_string = str(part.span.text)
  67. else:
  68. name_string = str(name.h2.b.text)
  69. part = i.find(name="div", attrs={"class": "pos dpos"})
  70. if part is None:
  71. part_string = "unknown"
  72. else:
  73. part_string = str(part.span.text)
  74. word = self.res.get(name_string)
  75. if word is None:
  76. tls_res = tls.start(name_string)
  77. if not tls_res.success:
  78. continue
  79. word = Word(name_string, tls_res.mp3)
  80. self.res[name_string] = word
  81. h = i.find(name="div", attrs={"class": "ddef_h"})
  82. if h is None:
  83. continue
  84. english = str(h.div.text)
  85. b = i.find(name="div", attrs={"class": "def-body ddef_b"})
  86. if b is None:
  87. continue
  88. chinese = str(b.span.text)
  89. comment = Word.Comment(part_string, english, chinese)
  90. eg = b.findAll(name="div", attrs={"class": "examp dexamp"})
  91. for e in eg:
  92. es = e.find(name="span", attrs={"class": "eg deg"})
  93. cs = e.find(name="span", attrs={"class": "trans dtrans dtrans-se hdb break-cj"})
  94. if es is None:
  95. continue
  96. es = str(es.text).replace("##", " ").replace("@@", " ")
  97. if cs is not None:
  98. cs = str(cs.text).replace("##", " ").replace("@@", " ")
  99. else:
  100. cs = ""
  101. comment.add_eg(f"{es}##{cs}")
  102. word.add_comment(comment)
  103. if len(self.res) == 0:
  104. self._soup = None
  105. self.__logger.debug(f"Response bad word: {self._res.url}")
  106. @property
  107. def is_find(self):
  108. return self._soup is not None
  109. class Word:
  110. class Comment:
  111. def __init__(self, part: str, english: str, chinese: str):
  112. self.part = part # 词性
  113. self.english = english
  114. self.chinese = chinese
  115. self.eg = []
  116. def add_eg(self, eg: str):
  117. eg = eg.strip()
  118. if eg == "##" or len(eg) == 0: # 中英文使用##拼接
  119. return
  120. self.eg.append(eg)
  121. def __str__(self):
  122. return f"{self.part} {self.english} {self.chinese} \neg: {self.eg}"
  123. def __init__(self, name: str, mp3: str, box: int = 1):
  124. self.name = name
  125. self.comment: Dict[str: "Word.Comment"] = {} # 注释
  126. if box < 1 or box > 5:
  127. box = 1
  128. self.box = box
  129. self.mp3 = mp3
  130. def add_comment(self, c: Comment):
  131. if self.comment.get(c.english) is None:
  132. self.comment[c.english] = c
  133. def set_box(self, box: int):
  134. if 0 < box < 6:
  135. self.box = box
  136. def __str__(self):
  137. ret = f"{self.name}:\n"
  138. for i in self.comment:
  139. ret += f'note: {self.comment[i]};\n'
  140. return ret