word.py 5.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158
  1. """
  2. 文件名: word.py
  3. 通过检索网络词典获得一个词语的中文翻译
  4. 使用requests生成一个网络请求
  5. 使用bs4解析网页
  6. """
  7. import logging
  8. import requests
  9. from bs4 import BeautifulSoup
  10. from typing import Optional, Dict
  class WordDict:
      """Client that looks a word up in the Cambridge online dictionary.

      The request goes to the "direct search" endpoint of the
      English / Simplified-Chinese data set; the HTTP response is wrapped in a
      ``Response`` object which does the HTML parsing.
      """

      __data_set_search = "english-chinese-simplified"
      __url = f"https://dictionary.cambridge.org/zhs/%E6%90%9C%E7%B4%A2/direct/?datasetsearch={__data_set_search}"
      __logger = logging.getLogger("dict")
      __logger.propagate = False  # records are not forwarded to ancestor loggers

      def __init__(self,
                   user_agent: Optional[str] = None,
                   proxies: Optional[dict] = None,
                   timeout: Optional[float] = 10.0):
          """Configure the request headers, proxies and timeout.

          Args:
              user_agent: Value for the ``User-Agent`` header; a desktop Chrome
                  string is used when omitted.
              proxies: Proxy mapping handed to ``requests``; when omitted an
                  explicit local proxy on port 8889 is used.
              timeout: Seconds to wait for the server before ``requests`` raises
                  a timeout error. ``None`` waits indefinitely (the behaviour
                  before this parameter existed).
          """
          if user_agent is None:
              user_agent = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) "
                            "Chrome/17.0.963.56 Safari/535.11 ")
          if proxies is None:
              # Default to an explicit local proxy rather than the system proxy settings.
              proxies = {'http': "http://localhost:8889", 'https': "http://localhost:8889"}
          self.headers = {"User-Agent": user_agent}  # request headers
          # Copy so that set_proxies() never mutates the dict owned by the caller.
          self.proxies = dict(proxies)
          self.timeout = timeout

      def set_headers(self, headers: dict):
          """Merge ``headers`` into the headers sent with every request."""
          self.headers.update(headers)

      def set_proxies(self, proxies: dict):
          """Merge ``proxies`` into the proxy mapping used for every request."""
          self.proxies.update(proxies)

      def get_requests(self, q: str) -> "Response":
          """Query the dictionary for ``q`` and return the parsed ``Response``.

          Network errors raised by ``requests`` (including a timeout) propagate
          to the caller unchanged.
          """
          response = requests.get(self.__url,
                                  params={"q": q},
                                  headers=self.headers,
                                  proxies=self.proxies,
                                  timeout=self.timeout)
          self.__logger.info(f"Get requests params: {q} url: {response.url} status: {response.status_code}")
          return Response(response)
  class Response:
      """Parsed result of one dictionary lookup.

      Attributes:
          di_body: The ``div.di-body`` tag of the page, or ``None`` when the
              request failed or the tag is absent.
          entry: List of ``(tag, kind)`` pairs; ``kind`` is 1 for
              ``pr entry-body__el`` blocks and 2 for ``pv-block`` blocks.
          res: Mapping from head word to its ``Word`` object. Empty when
              nothing could be parsed.

      NOTE(review): only the first definition (``ddef_h`` / ``def-body``) found
      inside each entry block is recorded, because ``find`` returns the first
      match. Confirm whether that is intended.
      """

      __logger = logging.getLogger("dict.response")
      __logger.propagate = False  # records are not forwarded to ancestor loggers

      def __init__(self, response: "requests.Response"):
          """Parse ``response``; use ``is_find`` to learn whether it succeeded."""
          self._res = response
          self._soup = None  # stays None unless at least one word was parsed
          # Assigned up front so the attributes exist on every failure path.
          self.di_body = None
          self.entry = []
          self.res: Dict[str, "Word"] = {}

          if self._res.status_code != 200:
              self.__logger.debug(f"Response bad status : {self._res.status_code}")
              return

          soup = BeautifulSoup(self._res.text, "html.parser")
          self.di_body = soup.find(name="div", attrs={"class": "di-body"})
          if self.di_body is None:
              self.__logger.debug(f"Response bad syntax url: {self._res.url}")
              return

          self.entry = [(tag, 1) for tag in
                        self.di_body.find_all(name="div", attrs={"class": "pr entry-body__el"})]
          self.entry += [(tag, 2) for tag in
                         self.di_body.find_all(name="div", attrs={"class": "pv-block"})]
          if not self.entry:
              self.__logger.debug(f"Response bad syntax url: {self._res.url}")
              return

          for tag, kind in self.entry:
              self._parse_entry(tag, kind)

          if not self.res:
              self.__logger.debug(f"Response bad word: {self._res.url}")
              return
          self._soup = soup

      @staticmethod
      def _nested_text(tag, *names):
          """Follow ``names`` as successive first-descendant lookups; return the text or ``None``."""
          for child_name in names:
              if tag is None:
                  return None
              tag = tag.find(child_name)
          if tag is None:
              return None
          return str(tag.text)

      @staticmethod
      def _clean(text) -> str:
          """Blank out the ``##`` and ``@@`` markers so they cannot clash with the example separator."""
          return str(text).replace("##", " ").replace("@@", " ")

      def _parse_entry(self, block, kind: int):
          """Extract the head word and its first definition from one entry block into ``self.res``."""
          title = block.find(name="div", attrs={"class": "di-title"})
          if title is None:
              return

          # The two block kinds keep the head word and part of speech in different tags.
          if kind == 1:
              name_path, part_class = ("span", "span"), "posgram dpos-g hdib lmr-5"
          else:
              name_path, part_class = ("h2", "b"), "pos dpos"

          name_string = self._nested_text(title, *name_path)
          if name_string is None:
              return  # malformed title: skip the block instead of raising

          part = block.find(name="div", attrs={"class": part_class})
          part_string = self._nested_text(part, "span")
          if part_string is None:
              part_string = "unknown"

          word = self.res.get(name_string)
          if word is None:
              word = Word(name_string)
              self.res[name_string] = word

          head = block.find(name="div", attrs={"class": "ddef_h"})
          english = self._nested_text(head, "div")
          if english is None:
              return

          body = block.find(name="div", attrs={"class": "def-body ddef_b"})
          chinese = self._nested_text(body, "span")
          if chinese is None:
              return

          comment = Word.Comment(part_string, english, chinese)
          for example in body.find_all(name="div", attrs={"class": "examp dexamp"}):
              sentence = example.find(name="span", attrs={"class": "eg deg"})
              if sentence is None:
                  continue
              translation = example.find(name="span",
                                         attrs={"class": "trans dtrans dtrans-se hdb break-cj"})
              es = self._clean(sentence.text)
              cs = "" if translation is None else self._clean(translation.text)
              # Stored as "<english>##<chinese>".
              comment.add_eg(f"{es}##{cs}")
          word.add_comment(comment)

      @property
      def is_find(self):
          """``True`` when the page was fetched and at least one word was parsed."""
          return self._soup is not None
  class Word:
      """A head word together with the definitions collected for it."""

      class Comment:
          """One definition of a word plus its example sentences."""

          def __init__(self, part: str, english: str, chinese: str):
              """Store the part of speech and the English / Chinese definition text."""
              self.part = part  # part of speech
              self.english = english
              self.chinese = chinese
              self.eg = []  # example strings, in insertion order

          def add_eg(self, eg: str):
              """Append the example ``eg`` after stripping surrounding whitespace.

              A bare ``"##"`` is ignored; no other content check is made.
              """
              eg = eg.strip()
              if eg == "##":
                  return
              self.eg.append(eg)

          def __str__(self):
              return f"{self.part} {self.english} {self.chinese} \neg: {self.eg}"

      def __init__(self, name: str):
          """Create a word called ``name`` with no definitions yet."""
          self.name = name
          # Definitions keyed by their English text.
          self.comment: Dict[str, "Word.Comment"] = {}

      def add_comment(self, c: Comment):
          """Record ``c`` unless a definition with the same English text already exists."""
          self.comment.setdefault(c.english, c)

      def __str__(self):
          notes = "".join(f'note: {c};\n' for c in self.comment.values())
          return f"{self.name}:\n{notes}"