/*
 * File: lexical
 * Purpose: lexical analysis for aFunlang
 */
#include <cctype>
#include <cstring>
#include <cstdlib>
#include "parser.h"
#include "init.h"
#include "inter.h"

#ifndef isascii
#define isascii(c) (((c) & ~0x7f) == 0)
#endif

#define isignore(ch) (isascii(ch) && (iscntrl(ch) || isspace(ch) || (ch) == ',')) /* characters that are ignored */
#define iselement(ch) (!isascii(ch) || isgraph(ch)) /* characters that may form an element */

#define DEL_TOKEN (0)
#define FINISH_TOKEN (-1)
#define CONTINUE_TOKEN (1)
#define ERROR_TOKEN (-2)

#define printLexicalError(info, parser) do { \
    writeErrorLog(aFunCoreLogger, "[Lexical] %s:%d %s", (parser)->reader.file, (parser)->reader.line, (info ## Log)); \
    printf_stderr(0, "[%s] %s:%d : %s\n", HT_aFunGetText(lexical_n, "Lexical"), (parser)->reader.file, \
                  (parser)->reader.line, info ## Console); \
    (parser)->is_error = true; /* The error flag lives on the Parser, not the Lexical; a Lexical error only means the lexer stops running */ \
} while(0)

#define printLexicalWarning(info, parser) do { \
    writeWarningLog(aFunCoreLogger, "[Lexical] %s:%d %s", (parser)->reader.file, (parser)->reader.line, (info ## Log)); \
    printf_stderr(0, "[%s] %s:%d : %s\n", HT_aFunGetText(lexical_n, "Lexical"), (parser)->reader.file, \
                  (parser)->reader.line, info ## Console); \
} while(0)

namespace aFuncore {
    void Parser::setLexicalLast(LexicalStatus status, TokenType token) {
        lexical.status = status;
        lexical.last = reader.countRead();
        lexical.token = token;
    }

    /*
     * Function family: the done series (doneXXX)
     * Purpose: transition from state xxx to another state
     * Return value:  1 (CONTINUE_TOKEN) OK, keep reading
     * Return value:  0 (DEL_TOKEN)      error encountered, lexing can still continue
     * Return value: -1 (FINISH_TOKEN)   OK, token finished -> setLexicalLast must have been called
     * Return value: -2 (ERROR_TOKEN)    error encountered, lexing cannot continue
     * Note: these functions do not check that `status` is valid before being called
     */
    /*
     * State machine diagram:
     * [lex_begin]
     *     -> NUL -> (lex_nul) # return FINISH_TOKEN
     *     -> ALL_PREFIX -> [lex_prefix] # return FINISH_TOKEN
     *     -> ! -> (lex_prefix_block_p)
     *     -> @ -> (lex_prefix_block_b)
     *     -> # -> (lex_prefix_block_c)
     *     -> ( -> [lex_lp] # return FINISH_TOKEN
     *     -> [ -> [lex_lb] # return FINISH_TOKEN
     *     -> { -> [lex_lc] # return FINISH_TOKEN
     *     -> ) -> [lex_rp] # return FINISH_TOKEN
     *     -> ] -> [lex_rb] # return FINISH_TOKEN
     *     -> } -> [lex_rc] # return FINISH_TOKEN
     *     -> ; -> (lex_comment_before)
     *     -> isignore(ch) -> [lex_space]
     *     -> | -> (lex_element_long)
     *     -> iselement(ch) -> [lex_element_short]
     */
    int Parser::doneBegin(char ch) {
        if (ch == aFuntool::NUL) {
            setLexicalLast(lex_nul, TK_EOF);
            return FINISH_TOKEN;
        } else if (strchr(Inter::ALL_PREFIX, ch)) {  /* a prefix character */
            setLexicalLast(lex_prefix, TK_PREFIX);
            return FINISH_TOKEN;
        } else if (strchr("!@#", ch)) {
            switch (ch) {
                case '!':
                    lexical.status = lex_prefix_block_p;
                    return 1;
                case '@':
                    lexical.status = lex_prefix_block_b;
                    return 1;
                case '#':
                    lexical.status = lex_prefix_block_c;
                    return 1;
                default:
                    fatalErrorLog(aFunCoreLogger, EXIT_FAILURE, "Switch illegal characters");
                    return ERROR_TOKEN;
            }
        } else if (strchr("([{)]}", ch)) {  /* brackets */
            switch (ch) {
                case '(':
                    setLexicalLast(lex_lp, TK_LP);
                    return FINISH_TOKEN;
                case '[':
                    setLexicalLast(lex_lb, TK_LB);
                    return FINISH_TOKEN;
                case '{':
                    setLexicalLast(lex_lc, TK_LC);
                    return FINISH_TOKEN;
                case ')':
                    setLexicalLast(lex_rp, TK_RP);
                    return FINISH_TOKEN;
                case ']':
                    setLexicalLast(lex_rb, TK_RB);
                    return FINISH_TOKEN;
                case '}':
                    setLexicalLast(lex_rc, TK_RC);
                    return FINISH_TOKEN;
                default:
                    fatalErrorLog(aFunCoreLogger, EXIT_FAILURE, "Switch illegal characters");
                    return ERROR_TOKEN;
            }
        } else if (ch == ';') {
            lexical.status = lex_comment_before;
            return 1;
        } else if (isignore(ch)) {  // whitespace and control characters are ignored
            setLexicalLast(lex_space, TK_SPACE);
            return 1;
        } else if (ch == '|') {
            lexical.status = lex_element_long;
            return 1;
        } else if (iselement(ch)) {  // visible characters other than whitespace
            setLexicalLast(lex_element_short, TK_ELEMENT_SHORT);
            return 1;
        }
        // TODO-szh: emit a warning
        return DEL_TOKEN;
    }
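    /*
     * For example: from lex_begin, reading `{` records TK_LC and finishes the
     * token immediately; reading `;` only moves to lex_comment_before and keeps
     * consuming characters; an ordinary letter enters lex_element_short so the
     * following characters can extend the element; a character matched by
     * isignore starts a TK_SPACE run.
     */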
    /*
     * State machine diagram:
     * [lex_prefix_block_p] -> ( -> [lex_lp] # return FINISH_TOKEN
     * [lex_prefix_block_b] -> ( -> [lex_lb] # return FINISH_TOKEN
     * [lex_prefix_block_c] -> ( -> [lex_lc] # return FINISH_TOKEN
     * [lex_prefix_block_p] -> ) -> [lex_rp] # return FINISH_TOKEN
     * [lex_prefix_block_b] -> ) -> [lex_rb] # return FINISH_TOKEN
     * [lex_prefix_block_c] -> ) -> [lex_rc] # return FINISH_TOKEN
     */
    int Parser::donePrefixBlock(char ch) {
        if (ch == '(') {
            switch (lexical.status) {
                case lex_prefix_block_p:
                    setLexicalLast(lex_lp, TK_LP);
                    return FINISH_TOKEN;
                case lex_prefix_block_b:
                    setLexicalLast(lex_lb, TK_LB);
                    return FINISH_TOKEN;
                case lex_prefix_block_c:
                    setLexicalLast(lex_lc, TK_LC);
                    return FINISH_TOKEN;
                default:
                    fatalErrorLog(aFunCoreLogger, EXIT_FAILURE, "Switch illegal characters");
                    return ERROR_TOKEN;
            }
        } else if (ch == ')') {
            switch (lexical.status) {
                case lex_prefix_block_p:
                    setLexicalLast(lex_rp, TK_RP);
                    return FINISH_TOKEN;
                case lex_prefix_block_b:
                    setLexicalLast(lex_rb, TK_RB);
                    return FINISH_TOKEN;
                case lex_prefix_block_c:
                    setLexicalLast(lex_rc, TK_RC);
                    return FINISH_TOKEN;
                default:
                    fatalErrorLog(aFunCoreLogger, EXIT_FAILURE, "Switch illegal characters");
                    return ERROR_TOKEN;
            }
        }
        // TODO-szh: emit a warning
        return DEL_TOKEN;
    }
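    /*
     * Taken together with doneBegin above, the two-character sequences `!(`,
     * `@(` and `#(` therefore yield TK_LP, TK_LB and TK_LC, and `!)`, `@)` and
     * `#)` yield TK_RP, TK_RB and TK_RC, just like the plain bracket characters.
     */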
    /*
     * State machine diagram:
     * [lex_comment_before]
     *     -> '\n' || NUL -> [lex_uni_comment_end] # return FINISH_TOKEN
     *     -> ; -> (lex_mutli_comment) # mutli_comment = 0
     *     -> other -> (lex_uni_comment)
     */
    int Parser::doneCommentBefore(char ch) {
        if (ch == '\n' || ch == aFuntool::NUL) {
            setLexicalLast(lex_uni_comment_end, TK_COMMENT);
            return FINISH_TOKEN;
        } else if (ch == ';') {  // multi-line comment
            lexical.status = lex_mutli_comment;
            lexical.mutli_comment = 0;
            return 1;
        }
        lexical.status = lex_uni_comment;
        return 1;
    }

    /*
     * State machine diagram:
     * [lex_uni_comment]
     *     -> '\n' || NUL -> [lex_uni_comment_end] # return FINISH_TOKEN
     *     -> other -> (lex_uni_comment)
     */
    int Parser::doneUniComment(char ch) {
        if (ch == '\n' || ch == aFuntool::NUL) {
            setLexicalLast(lex_uni_comment_end, TK_COMMENT);
            return FINISH_TOKEN;
        }
        lexical.status = lex_uni_comment;
        return 1;
    }
    /*
     * State machine diagram:
     * [lex_mutli_comment]
     *     -> NUL -> [lex_mutli_comment_end] # return FINISH_TOKEN; [warning]
     *     -> ; -> (lex_mutli_comment_end_before)
     *     -> other -> (lex_mutli_comment)
     */
    int Parser::doneMutliComment(char ch) {
        if (ch == aFuntool::NUL) {
            setLexicalLast(lex_mutli_comment_end, TK_COMMENT);  // unterminated multi-line comment at end of input
            // TODO-szh: emit a warning
            return FINISH_TOKEN;
        } else if (ch == ';')
            lexical.status = lex_mutli_comment_end_before;
        else
            lexical.status = lex_mutli_comment;
        return 1;
    }
    /*
     * State machine diagram:
     * [lex_mutli_comment_end_before]
     *     -> NUL -> [lex_mutli_comment_end] # return FINISH_TOKEN; [warning]
     *     -> ; -> (lex_mutli_comment) # mutli_comment++;
     *     -> = ->
     *         mutli_comment == 0 -> [lex_mutli_comment_end] # return FINISH_TOKEN
     *         else -> (lex_mutli_comment) # mutli_comment--;
     */
    int Parser::doneMutliCommentBeforeEnd(char ch) {
        if (ch == aFuntool::NUL) {
            setLexicalLast(lex_mutli_comment_end, TK_COMMENT);
            // TODO-szh: emit a warning
            return FINISH_TOKEN;
        } else if (ch == ';') {
            /* nested comment */
            lexical.mutli_comment++;
            lexical.status = lex_mutli_comment;
        } else if (ch == '=') {
            if (lexical.mutli_comment == 0) {
                /* end of the comment */
                setLexicalLast(lex_mutli_comment_end, TK_COMMENT);
                return FINISH_TOKEN;
            } else {
                /* leaving a nested comment */
                lexical.mutli_comment--;
                lexical.status = lex_mutli_comment;
            }
        }
        lexical.status = lex_mutli_comment;
        return 1;
    }
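    /*
     * For example: `; text` followed by a newline is one single-line TK_COMMENT,
     * while `;;` opens a multi-line comment that `;=` closes. Inside it each
     * further `;;` increments mutli_comment and each `;=` decrements it, so
     * `;; outer ;; inner ;= outer ;=` is read as one nested comment token.
     * Comment tokens are later discarded by getTokenFromLexical.
     */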
    /*
     * State machine diagram:
     * [lex_element_long]
     *     -> NUL -> error
     *     -> | -> [lex_element_long_end]
     *     -> other -> (lex_element_long)
     */
    int Parser::doneElementLong(char ch) {
        if (ch == '|') {  // terminator
            setLexicalLast(lex_element_long_end, TK_ELEMENT_LONG);
            return 1;
        } else if (ch == aFuntool::NUL) {
            // TODO-szh: emit a warning
            return ERROR_TOKEN;
        }
        lexical.status = lex_element_long;
        return 1;
    }

    /*
     * State machine diagram:
     * [lex_element_long_end]
     *     -> | -> (lex_element_long)
     *     -> other -> [lex_element_long_end] # return FINISH_TOKEN
     */
    int Parser::doneElementLongEnd(char ch) {
        if (ch == '|') {  // `||` does not terminate the element
            lexical.status = lex_element_long;
            return 1;
        }
        lexical.status = lex_element_long_end;
        return FINISH_TOKEN;
    }
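    /*
     * For example: a long element is written between `|` characters, and `||`
     * inside it stands for a literal `|`, so `|hello world|` and `|a||b|` are
     * single TK_ELEMENT_LONG tokens. getTokenFromLexical later strips the
     * delimiters and collapses `||`, producing the texts `hello world` and `a|b`.
     */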
    /*
     * State machine diagram:
     * [lex_element_short]
     *     -> !strchr("!@#([{}]);,", ch) && iselement(ch) -> [lex_element_short]
     *     -> other -> (lex_element_short) # return FINISH_TOKEN
     */
    int Parser::doneElementShort(char ch) {
        if (!strchr("!@#([{}]);,", ch) && iselement(ch)) {  // visible characters other than whitespace (NUL excluded)
            setLexicalLast(lex_element_short, TK_ELEMENT_SHORT);
            return 1;
        }
        lexical.status = lex_element_short;
        return FINISH_TOKEN;
    }

    /*
     * State machine diagram:
     * [lex_space]
     *     -> ch != NUL && isignore(ch) -> [lex_space]
     *     -> other -> (lex_space) # return FINISH_TOKEN
     */
    int Parser::doneSpace(char ch) {
        if (ch != aFuntool::NUL && isignore(ch)) {
            setLexicalLast(lex_space, TK_SPACE);
            return 1;
        }
        lexical.status = lex_space;
        return FINISH_TOKEN;
    }
    /*
     * Function: getTokenFromLexical
     * Purpose: fetch the next TokenType from the Lexical along with its text
     */
    Parser::TokenType Parser::getTokenFromLexical(std::string &text) {
        Parser::TokenType tt;
        int re;
        lexical.status = lex_begin;
        lexical.last = 0;
        text = "";

        if (lexical.is_end)
            return TK_EOF;
        else if (lexical.is_error || reader.isError())  /* do not run again after lexical or reader has errored */
            return TK_ERROR;

        while (true) {
            char ch = reader.getChar();
            if (reader.isError())
                return TK_ERROR;

            if (isascii(ch) && iscntrl(ch) && !isspace(ch) && ch != aFuntool::NUL)  // ASCII control character
                NULL;  // TODO-szh: emit a warning

            switch (lexical.status) {
                case lex_begin:
                    re = doneBegin(ch);
                    break;
                case lex_prefix_block_p:
                case lex_prefix_block_b:
                case lex_prefix_block_c:
                    re = donePrefixBlock(ch);
                    break;
                case lex_comment_before:
                    re = doneCommentBefore(ch);
                    break;
                case lex_element_long:
                    re = doneElementLong(ch);
                    break;
                case lex_mutli_comment:
                    re = doneMutliComment(ch);
                    break;
                case lex_uni_comment:
                    re = doneUniComment(ch);
                    break;
                case lex_mutli_comment_end_before:
                    re = doneMutliCommentBeforeEnd(ch);
                    break;
                case lex_space:
                    re = doneSpace(ch);
                    break;
                case lex_element_short:
                    re = doneElementShort(ch);
                    break;
                case lex_element_long_end:
                    re = doneElementLongEnd(ch);
                    break;
                default:
                    fatalErrorLog(aFunCoreLogger, EXIT_FAILURE, "Switch illegal characters");
                    re = ERROR_TOKEN;
                    break;
            }

            if (re == ERROR_TOKEN) {
                tt = TK_ERROR;
                break;
            } else if (re == DEL_TOKEN) {  // discard this token and keep lexing
                char *word = reader.readWord(lexical.last);
                free(word);
                lexical.status = lex_begin;
                lexical.last = 0;
                continue;
            } else if (re == FINISH_TOKEN) {
                char *word = reader.readWord(lexical.last);
                if (word == nullptr) {
                    tt = TK_ERROR;
                    break;
                }
                tt = lexical.token;
                if (tt == TK_ELEMENT_SHORT || tt == TK_PREFIX) {
                    text = word;
                    aFuntool::safeFree(word);
                } else if (tt == TK_ELEMENT_LONG) {
                    char *new_str = aFuntool::safeCalloc<char>(strlen(word) - 2 + 1);  // strip the leading and trailing `|`
                    bool flat = false;
                    char *p = word + 1;
                    size_t count = 0;
                    for (NULL; *p != aFuntool::NUL; p++) {
                        if (*p == '|' && !flat) {  // skip the first `|`; a trailing `|` is simply dropped, otherwise the character is written when the second `|` is met
                            flat = true;  /* count does not need to increase */
                            continue;
                        } else if (*p != '|' && flat)  // error encountered
                            break;
                        else
                            flat = false;
                        new_str[count] = *p;
                        count++;
                    }
                    text = new_str;
                    free(word);
                    free(new_str);
                } else
                    free(word);

                if (tt == TK_SPACE || tt == TK_COMMENT) {  // whitespace and comments are not returned to the caller
                    lexical.status = lex_begin;
                    lexical.last = 0;
                    continue;
                } else if (tt == TK_EOF)
                    lexical.is_end = true;
                break;
            }
        }
        return tt;
    }
}
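
/*
 * Usage sketch: one way a caller might drive the lexer, shown here only as an
 * illustration and wrapped in `#if 0` so it is never compiled. It assumes that
 * `aFuncore::Parser` exposes the nested TokenType enumerators (TK_EOF,
 * TK_ERROR, ...) to outside code, that a Parser object has already been
 * constructed around a reader, and that <cstdio> is available -- these are
 * assumptions about parser.h, not guarantees made by this file.
 */
#if 0
#include <cstdio>

static void dumpTokens(aFuncore::Parser &parser) {
    std::string text;
    for (;;) {
        // Each call returns the next significant token and fills `text` for
        // element/prefix tokens; TK_SPACE and TK_COMMENT never reach the caller.
        aFuncore::Parser::TokenType tt = parser.getTokenFromLexical(text);
        if (tt == aFuncore::Parser::TK_ERROR || tt == aFuncore::Parser::TK_EOF)
            break;
        std::printf("token %d: %s\n", static_cast<int>(tt), text.c_str());
    }
}
#endif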