lexical.cpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440
  1. /*
  2. * 文件名: lexical
  3. * 目标: aFunlang词法分析
  4. */
  5. #include <cctype>
  6. #include "core-parser.h"
  7. #include "core-init.h"
  8. #include "inter.h"
  /* Fallback for platforms whose <cctype> does not provide isascii: true for values 0..127. */
  #ifndef isascii
  #define isascii(c) (((c) & ~0x7f) == 0)
  #endif

  /*
   * isignore(ch): ch is skipped by the lexer -- an ASCII comma, whitespace or control character.
   * The isascii() guard is evaluated first so the <cctype> predicates only see values 0..127.
   */
  #define isignore(ch) (isascii(ch) && ((ch) == ',' || isspace(ch) || iscntrl(ch)))

  /*
   * iselement(ch): ch may be part of an element -- any non-ASCII byte, or a printable,
   * non-space ASCII character. isgraph() is only reached for ASCII values.
   */
  #define iselement(ch) (!(isascii(ch) && !isgraph(ch)))
  14. namespace aFuncore {
  15. void Parser::setLexicalLast(LexicalStatus status, TokenType token) {
  16. lexical.status = status;
  17. lexical.last = reader.countRead();
  18. lexical.token = token;
  19. }
  20. /*
  21. * 函数族: done系列 (doneXXX)
  22. * 目标: 用于把状态xxx转换为其他状态
  23. * 返回值: 1 正常
  24. * 返回值: 0 遇到错误, 仍可继续
  25. * 返回值: -1 正常, 不可继续 -> 必须设置 setLexicalLast
  26. * 返回值: -2 遇到错误, 不可继续
  27. * 注意: 函数使用前不再检查`status`是否正确
  28. */
  29. /*
  30. * 状态机图:
  31. * [lex_begin]
  32. * -> NUL -> (lex_nul)
  33. * -> ALL_PREFIX -> [lex_prefix] # return FINISH_TOKEN
  34. * -> ! -> (lex_prefix_block_p)
  35. * -> @ -> (lex_prefix_block_b)
  36. * -> # -> (lex_prefix_block_c)
  37. * -> ( -> [lex_lp] # return FINISH_TOKEN
  38. * -> [ -> [lex_lb] # return FINISH_TOKEN
  39. * -> { -> [lex_lc] # return FINISH_TOKEN
  40. * -> ) -> [lex_rp] # return FINISH_TOKEN
  41. * -> ] -> [lex_rb] # return FINISH_TOKEN
  42. * -> } -> [lex_rc] # return FINISH_TOKEN
  43. * -> ; -> (lex_comment_before)
  44. * -> isignore(ch) -> [lex_space]
  45. * -> | -> (lex_element_long)
  46. * -> iselement(ch) -> [lex_element]
  47. */
  48. Parser::DoneStatus Parser::doneBegin(char ch) {
  49. if (ch == aFuntool::NUL) {
  50. setLexicalLast(lex_nul, TK_EOF);
  51. return FINISH_TOKEN;
  52. } else if (strchr(Inter::ALL_PREFIX, ch)) { /* 属于前缀 */
  53. setLexicalLast(lex_prefix, TK_PREFIX);
  54. return FINISH_TOKEN;
  55. } else if (strchr("!@#", ch)) {
  56. switch (ch) {
  57. case '!':
  58. lexical.status = lex_prefix_block_p;
  59. return CONTINUE_TOKEN;
  60. case '@':
  61. lexical.status = lex_prefix_block_b;
  62. return CONTINUE_TOKEN;
  63. case '#':
  64. lexical.status = lex_prefix_block_c;
  65. return CONTINUE_TOKEN;
  66. default:
  67. fatalErrorLog(aFunCoreLogger, EXIT_FAILURE, "Switch illegal characters");
  68. pushEvent({ParserEvent::parser_error_unknown, reader.getFileLine(), ""});
  69. return ERROR_TOKEN;
  70. }
  71. } else if (strchr("([{)]}", ch)) { /* 括号 */
  72. switch (ch) {
  73. case '(':
  74. setLexicalLast(lex_lp, TK_LP);
  75. return FINISH_TOKEN;
  76. case '[':
  77. setLexicalLast(lex_lb, TK_LB);
  78. return FINISH_TOKEN;
  79. case '{':
  80. setLexicalLast(lex_lc, TK_LC);
  81. return FINISH_TOKEN;
  82. case ')':
  83. setLexicalLast(lex_rp, TK_RP);
  84. return FINISH_TOKEN;
  85. case ']':
  86. setLexicalLast(lex_rb, TK_RB);
  87. return FINISH_TOKEN;
  88. case '}':
  89. setLexicalLast(lex_rc, TK_RC);
  90. return FINISH_TOKEN;
  91. default:
  92. fatalErrorLog(aFunCoreLogger, EXIT_FAILURE, "Switch illegal characters");
  93. pushEvent({ParserEvent::parser_error_unknown, reader.getFileLine(), ""});
  94. return ERROR_TOKEN;
  95. }
  96. } else if (ch == ';') {
  97. lexical.status = lex_comment_before;
  98. return CONTINUE_TOKEN;
  99. } else if (isignore(ch)) { // 空白符或控制字符被忽略
  100. setLexicalLast(lex_space, TK_SPACE);
  101. return CONTINUE_TOKEN;
  102. } else if (ch == '|') {
  103. lexical.status = lex_element_long;
  104. return CONTINUE_TOKEN;
  105. } else if (iselement(ch)) { // 除空格外的可见字符
  106. setLexicalLast(lex_element_short, TK_ELEMENT_SHORT);
  107. return CONTINUE_TOKEN;
  108. }
  109. pushEvent({ParserEvent::lexical_error_char, reader.getFileLine(), ""});
  110. return DEL_TOKEN;
  111. }
  112. /*
  113. * 状态机图:
  114. * [lex_prefix_block_p] -> ( -> [lex_lp] # return FINISH_TOKEN
  115. * [lex_prefix_block_b] -> ( -> [lex_lb] # return FINISH_TOKEN
  116. * [lex_prefix_block_c] -> ( -> [lex_lc] # return FINISH_TOKEN
  117. * [lex_prefix_block_p] -> ) -> [lex_rp] # return FINISH_TOKEN
  118. * [lex_prefix_block_b] -> ) -> [lex_rb] # return FINISH_TOKEN
  119. * [lex_prefix_block_c] -> ) -> [lex_rc] # return FINISH_TOKEN
  120. */
  121. Parser::DoneStatus Parser::donePrefixBlock(char ch) {
  122. if (ch == '(') {
  123. switch (lexical.status) {
  124. case lex_prefix_block_p:
  125. setLexicalLast(lex_lp, TK_LP);
  126. return FINISH_TOKEN;
  127. case lex_prefix_block_b:
  128. setLexicalLast(lex_lb, TK_LB);
  129. return FINISH_TOKEN;
  130. case lex_prefix_block_c:
  131. setLexicalLast(lex_lc, TK_LC);
  132. return FINISH_TOKEN;
  133. default:
  134. fatalErrorLog(aFunCoreLogger, EXIT_FAILURE, "Switch illegal characters");
  135. pushEvent({ParserEvent::parser_error_unknown, reader.getFileLine(), ""});
  136. return ERROR_TOKEN;
  137. }
  138. } else if (ch == ')') {
  139. switch (lexical.status) {
  140. case lex_prefix_block_p:
  141. setLexicalLast(lex_rp, TK_RP);
  142. return FINISH_TOKEN;
  143. case lex_prefix_block_b:
  144. setLexicalLast(lex_rb, TK_RB);
  145. return FINISH_TOKEN;
  146. case lex_prefix_block_c:
  147. setLexicalLast(lex_rc, TK_RC);
  148. return FINISH_TOKEN;
  149. default:
  150. fatalErrorLog(aFunCoreLogger, EXIT_FAILURE, "Switch illegal characters");
  151. pushEvent({ParserEvent::parser_error_unknown, reader.getFileLine(), ""});
  152. return ERROR_TOKEN;
  153. }
  154. }
  155. pushEvent({ParserEvent::lexical_error_char, reader.getFileLine(), ""});
  156. return DEL_TOKEN;
  157. }
  158. /*
  159. * 状态机图:
  160. * [lex_comment_before]
  161. * -> '\n' || NUL -> [lex_uni_comment_end] # return FINISH_TOKEN
  162. * -> ; -> (lex_mutli_comment) # mutli_comment = 0
  163. * -> other -> (lex_uni_comment)
  164. */
  165. Parser::DoneStatus Parser::doneCommentBefore(char ch) {
  166. if (ch == '\n' || ch == aFuntool::NUL) {
  167. setLexicalLast(lex_uni_comment_end, TK_COMMENT);
  168. return FINISH_TOKEN;
  169. } else if (ch == ';') { // 多行注释
  170. lexical.status = lex_mutli_comment;
  171. lexical.mutli_comment = 0;
  172. return CONTINUE_TOKEN;
  173. }
  174. lexical.status = lex_uni_comment;
  175. return CONTINUE_TOKEN;
  176. }
  177. /*
  178. * 状态机图:
  179. * [lex_uni_comment]
  180. * -> '\n' || NUL -> [lex_uni_comment_end] # return FINISH_TOKEN
  181. * -> other -> (lex_uni_comment)
  182. */
  183. Parser::DoneStatus Parser::doneUniComment(char ch) {
  184. if (ch == '\n' || ch == aFuntool::NUL) {
  185. setLexicalLast(lex_uni_comment_end, TK_COMMENT);
  186. return FINISH_TOKEN;
  187. }
  188. lexical.status = lex_uni_comment;
  189. return CONTINUE_TOKEN;
  190. }
  191. /*
  192. * 状态机图:
  193. * [lex_mutli_comment]
  194. * -> NUL -> [lex_mutli_comment_end] # return FINISH_TOKEN; [warning]
  195. * -> ; -> (lex_mutli_comment_end_before)
  196. * -> other -> (lex_mutli_comment)
  197. */
  198. Parser::DoneStatus Parser::doneMutliComment(char ch) {
  199. if (ch == aFuntool::NUL) {
  200. lexical.status = lex_mutli_comment_end;
  201. pushEvent({ParserEvent::lexical_warning_comment_end, reader.getFileLine(), ""});
  202. return FINISH_TOKEN;
  203. } else if (ch == ';')
  204. lexical.status = lex_mutli_comment_end_before;
  205. else
  206. lexical.status = lex_mutli_comment;
  207. return CONTINUE_TOKEN;
  208. }
  209. /*
  210. * 状态机图:
  211. * [lex_mutli_comment_end_before]
  212. * -> NUL -> [lex_mutli_comment_end] # return FINISH_TOKEN; [warning]
  213. * -> ; -> (lex_mutli_comment) # mutli_comment++;
  214. * -> = ->
  215. * mutli_comment == 0 -> [lex_mutli_comment_end] # return FINISH_TOKEN
  216. * else -> (lex_mutli_comment)# mutli_comment--;
  217. */
  218. Parser::DoneStatus Parser::doneMutliCommentBeforeEnd(char ch) {
  219. if (ch == aFuntool::NUL) {
  220. setLexicalLast(lex_mutli_comment_end, TK_COMMENT);
  221. pushEvent({ParserEvent::lexical_warning_comment_end, reader.getFileLine(), ""});
  222. return FINISH_TOKEN;
  223. } else if (ch == ';') {
  224. /* 嵌套注释 */
  225. lexical.mutli_comment++;
  226. lexical.status = lex_mutli_comment;
  227. } else if (ch == '=') {
  228. if (lexical.mutli_comment == 0) {
  229. /* 注释结束 */
  230. setLexicalLast(lex_mutli_comment_end, TK_COMMENT);
  231. return FINISH_TOKEN;
  232. } else {
  233. /* 嵌套注释 */
  234. lexical.mutli_comment--;
  235. lexical.status = lex_mutli_comment;
  236. }
  237. }
  238. lexical.status = lex_mutli_comment;
  239. return CONTINUE_TOKEN;
  240. }
  241. /*
  242. * 状态机图:
  243. * [lex_element_long]
  244. * -> NUL -> error
  245. * -> | -> [lex_element_long_end]
  246. * -> other -> (lex_element_long)
  247. */
  248. Parser::DoneStatus Parser::doneElementLong(char ch) {
  249. if (ch == '|') { // 结束符
  250. setLexicalLast(lex_element_long_end, TK_ELEMENT_LONG);
  251. return CONTINUE_TOKEN;
  252. } else if (ch == aFuntool::NUL) {
  253. pushEvent({ParserEvent::lexical_error_element_end, reader.getFileLine(), ""});
  254. return ERROR_TOKEN;
  255. }
  256. lexical.status = lex_element_long;
  257. return CONTINUE_TOKEN;
  258. }
  259. /*
  260. * 状态机图:
  261. * [lex_element_long]
  262. * -> | -> (lex_element_long)
  263. * -> other -> [lex_element_long_end] # return FINISH_TOKEN
  264. */
  265. Parser::DoneStatus Parser::doneElementLongEnd(char ch) {
  266. if (ch == '|') { // ||表示非结束
  267. lexical.status = lex_element_long;
  268. return CONTINUE_TOKEN;
  269. }
  270. lexical.status = lex_element_long_end;
  271. return FINISH_TOKEN;
  272. }
  273. /*
  274. * 状态机图:
  275. * [lex_element_short]
  276. * -> !strchr("!@#([{}]);,", ch) && iselement(ch) -> (lex_element_short)
  277. * -> other -> (lex_element_short) # return FINISH_TOKEN
  278. */
  279. Parser::DoneStatus Parser::doneElementShort(char ch) {
  280. if (!strchr("!@#([{}]);,", ch) && iselement(ch)) { // 除空格外的可见字符 (不包括NUL)
  281. setLexicalLast(lex_element_short, TK_ELEMENT_SHORT);
  282. return CONTINUE_TOKEN;
  283. }
  284. lexical.status = lex_element_short;
  285. return FINISH_TOKEN;
  286. }
  287. /*
  288. * 状态机图:
  289. * [lex_space]
  290. * -> ch != NUL && isignore(ch) -> (lex_space)
  291. * -> other -> (lex_space) # return FINISH_TOKEN
  292. */
  293. Parser::DoneStatus Parser::doneSpace(char ch) {
  294. if (ch != aFuntool::NUL && isignore(ch)) {
  295. setLexicalLast(lex_space, TK_SPACE);
  296. return CONTINUE_TOKEN;
  297. }
  298. lexical.status = lex_space;
  299. return FINISH_TOKEN;
  300. }
  301. /*
  302. * 函数名: getTokenFromLexical
  303. * 目标: 获取Lexical的TokenType以及相关值
  304. */
  305. Parser::TokenType Parser::getTokenFromLexical(std::string &text) {
  306. TokenType tt;
  307. DoneStatus re;
  308. lexical.status = lex_begin;
  309. lexical.last = 0;
  310. text = "";
  311. if (lexical.is_end)
  312. return TK_EOF;
  313. else if (lexical.is_error || reader.isError()) /* lexical和reader出现异常后不再执行 */
  314. return TK_ERROR;
  315. while (true) {
  316. char ch = reader.getChar();
  317. if (reader.isError()) {
  318. pushEvent({ParserEvent::reader_error, reader.getFileLine(), ""});
  319. return TK_ERROR;
  320. }
  321. if (isascii(ch) && iscntrl(ch) && !isspace(ch) && ch != aFuntool::NUL) // ascii 控制字符
  322. pushEvent({ParserEvent::lexical_error_char, reader.getFileLine(), ""});
  323. switch (lexical.status) {
  324. case lex_begin:
  325. re = doneBegin(ch);
  326. break;
  327. case lex_prefix_block_p:
  328. case lex_prefix_block_b:
  329. case lex_prefix_block_c:
  330. re = donePrefixBlock(ch);
  331. break;
  332. case lex_comment_before:
  333. re = doneCommentBefore(ch);
  334. break;
  335. case lex_element_long:
  336. re = doneElementLong(ch);
  337. break;
  338. case lex_mutli_comment:
  339. re = doneMutliComment(ch);
  340. break;
  341. case lex_uni_comment:
  342. re = doneUniComment(ch);
  343. break;
  344. case lex_mutli_comment_end_before:
  345. re = doneMutliCommentBeforeEnd(ch);
  346. break;
  347. case lex_space:
  348. re = doneSpace(ch);
  349. break;
  350. case lex_element_short:
  351. re = doneElementShort(ch);
  352. break;
  353. case lex_element_long_end:
  354. re = doneElementLongEnd(ch);
  355. break;
  356. default:
  357. fatalErrorLog(aFunCoreLogger, EXIT_FAILURE, "Switch illegal characters");
  358. re = ERROR_TOKEN;
  359. break;
  360. }
  361. if (re == ERROR_TOKEN) {
  362. tt = TK_ERROR;
  363. lexical.is_error = true;
  364. break;
  365. } else if (re == DEL_TOKEN) { // 删除该token, 继续执行
  366. char *word = reader.readWord(lexical.last);
  367. aFuntool::safeFree(word);
  368. lexical.status = lex_begin;
  369. lexical.last = 0;
  370. continue;
  371. } else if (re == FINISH_TOKEN) {
  372. char *word = reader.readWord(lexical.last);
  373. if (word == nullptr) {
  374. tt = TK_ERROR;
  375. break;
  376. }
  377. tt = lexical.token;
  378. if (tt == TK_ELEMENT_SHORT || tt == TK_PREFIX) {
  379. text = word;
  380. aFuntool::safeFree(word);
  381. } else if (tt == TK_ELEMENT_LONG) {
  382. char *new_str = aFuntool::safeCalloc<char>(strlen(word) - 2 + 1); // 去除收尾|
  383. bool flat = false;
  384. char *p = word + 1;
  385. size_t count = 0;
  386. for (NULL; *p != aFuntool::NUL; p++) {
  387. if (*p == '|' && !flat) { // 跳过第一个 `|`, 如果是末尾|则自然跳过, 若不是则在遇到第二个`|`时写入数据
  388. flat = true; /* count不需要递增 */
  389. continue;
  390. } else if (*p != '|' && flat) // 遇到错误
  391. break;
  392. else
  393. flat = false;
  394. new_str[count] = *p;
  395. count++;
  396. }
  397. text = new_str;
  398. aFuntool::safeFree(word);
  399. aFuntool::safeFree(new_str);
  400. } else
  401. aFuntool::safeFree(word);
  402. if (tt == TK_SPACE || tt == TK_COMMENT) {
  403. lexical.status = lex_begin;
  404. lexical.last = 0;
  405. continue;
  406. } else if (tt == TK_EOF)
  407. lexical.is_end = true;
  408. break;
  409. }
  410. }
  411. return tt;
  412. }
  413. }