/*
 * File: lexical
 * Purpose: lexical analysis for aFunlang
 */
#include <cctype>
#include <cstring>
#include <cstdlib>
#include "parser.h"
#include "init.h"
#include "inter.h"

#ifndef isascii
#define isascii(c) (((c) & ~0x7f) == 0)
#endif

#define isignore(ch) (isascii(ch) && (iscntrl(ch) || isspace(ch) || (ch) == ',')) /* characters that are ignored */
#define iselement(ch) (!isascii(ch) || isgraph(ch)) /* characters that may form an element */

#define DEL_TOKEN (0)
#define FINISH_TOKEN (-1)
#define CONTINUE_TOKEN (1)
#define ERROR_TOKEN (-2)

#define printLexicalError(info, parser) do { \
    writeErrorLog(aFunCoreLogger, "[Lexical] %s:%d %s", (parser)->reader.file, (parser)->reader.line, (info ## Log)); \
    printf_stderr(0, "[%s] %s:%d : %s\n", HT_aFunGetText(lexical_n, "Lexical"), (parser)->reader.file, \
                  (parser)->reader.line, info ## Console); \
    (parser)->is_error = true; /* The error flag lives on the Parser, not the Lexical; a Lexical error only means the lexer stops running */ \
} while(0)

#define printLexicalWarning(info, parser) do { \
    writeWarningLog(aFunCoreLogger, "[Lexical] %s:%d %s", (parser)->reader.file, (parser)->reader.line, (info ## Log)); \
    printf_stderr(0, "[%s] %s:%d : %s\n", HT_aFunGetText(lexical_n, "Lexical"), (parser)->reader.file, \
                  (parser)->reader.line, info ## Console); \
} while(0)

namespace aFuncore {
    void Parser::setLexicalLast(LexicalStatus status, TokenType token) {
        lexical.status = status;
        lexical.last = reader.countRead();
        lexical.token = token;
    }

    /*
     * Function family: the done series (doneXXX)
     * Purpose: transition from state xxx to another state
     * Return value:  1 (CONTINUE_TOKEN) OK, keep reading
     * Return value:  0 (DEL_TOKEN)      error encountered, lexing can still continue
     * Return value: -1 (FINISH_TOKEN)   OK, token finished -> setLexicalLast must have been called
     * Return value: -2 (ERROR_TOKEN)    error encountered, lexing cannot continue
     * Note: these functions do not check that `status` is valid before being called
     */
    /*
     * State machine diagram:
     * [lex_begin]
     *     -> NUL -> (lex_nul) # return FINISH_TOKEN
     *     -> ALL_PREFIX -> [lex_prefix] # return FINISH_TOKEN
     *     -> ! -> (lex_prefix_block_p)
     *     -> @ -> (lex_prefix_block_b)
     *     -> # -> (lex_prefix_block_c)
     *     -> ( -> [lex_lp] # return FINISH_TOKEN
     *     -> [ -> [lex_lb] # return FINISH_TOKEN
     *     -> { -> [lex_lc] # return FINISH_TOKEN
     *     -> ) -> [lex_rp] # return FINISH_TOKEN
     *     -> ] -> [lex_rb] # return FINISH_TOKEN
     *     -> } -> [lex_rc] # return FINISH_TOKEN
     *     -> ; -> (lex_comment_before)
     *     -> isignore(ch) -> [lex_space]
     *     -> | -> (lex_element_long)
     *     -> iselement(ch) -> [lex_element_short]
     */
    int Parser::doneBegin(char ch) {
        if (ch == aFuntool::NUL) {
            setLexicalLast(lex_nul, TK_EOF);
            return FINISH_TOKEN;
        } else if (strchr(Inter::ALL_PREFIX, ch)) {  /* a prefix character */
            setLexicalLast(lex_prefix, TK_PREFIX);
            return FINISH_TOKEN;
        } else if (strchr("!@#", ch)) {
            switch (ch) {
                case '!':
                    lexical.status = lex_prefix_block_p;
                    return 1;
                case '@':
                    lexical.status = lex_prefix_block_b;
                    return 1;
                case '#':
                    lexical.status = lex_prefix_block_c;
                    return 1;
                default:
                    fatalErrorLog(aFunCoreLogger, EXIT_FAILURE, "Switch illegal characters");
                    return ERROR_TOKEN;
            }
        } else if (strchr("([{)]}", ch)) {  /* brackets */
            switch (ch) {
                case '(':
                    setLexicalLast(lex_lp, TK_LP);
                    return FINISH_TOKEN;
                case '[':
                    setLexicalLast(lex_lb, TK_LB);
                    return FINISH_TOKEN;
                case '{':
                    setLexicalLast(lex_lc, TK_LC);
                    return FINISH_TOKEN;
                case ')':
                    setLexicalLast(lex_rp, TK_RP);
                    return FINISH_TOKEN;
                case ']':
                    setLexicalLast(lex_rb, TK_RB);
                    return FINISH_TOKEN;
                case '}':
                    setLexicalLast(lex_rc, TK_RC);
                    return FINISH_TOKEN;
                default:
                    fatalErrorLog(aFunCoreLogger, EXIT_FAILURE, "Switch illegal characters");
                    return ERROR_TOKEN;
            }
        } else if (ch == ';') {
            lexical.status = lex_comment_before;
            return 1;
        } else if (isignore(ch)) {  // whitespace and control characters are ignored
            setLexicalLast(lex_space, TK_SPACE);
            return 1;
        } else if (ch == '|') {
            lexical.status = lex_element_long;
            return 1;
        } else if (iselement(ch)) {  // visible characters other than whitespace
            setLexicalLast(lex_element_short, TK_ELEMENT_SHORT);
            return 1;
        }
        // TODO-szh: emit a warning
        return DEL_TOKEN;
    }
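    /*
     * For example: from lex_begin, reading `{` records TK_LC and finishes the
     * token immediately; reading `;` only moves to lex_comment_before and keeps
     * consuming characters; an ordinary letter enters lex_element_short so the
     * following characters can extend the element; a character matched by
     * isignore starts a TK_SPACE run.
     */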
    /*
     * State machine diagram:
     * [lex_prefix_block_p] -> ( -> [lex_lp] # return FINISH_TOKEN
     * [lex_prefix_block_b] -> ( -> [lex_lb] # return FINISH_TOKEN
     * [lex_prefix_block_c] -> ( -> [lex_lc] # return FINISH_TOKEN
     * [lex_prefix_block_p] -> ) -> [lex_rp] # return FINISH_TOKEN
     * [lex_prefix_block_b] -> ) -> [lex_rb] # return FINISH_TOKEN
     * [lex_prefix_block_c] -> ) -> [lex_rc] # return FINISH_TOKEN
     */
    int Parser::donePrefixBlock(char ch) {
        if (ch == '(') {
            switch (lexical.status) {
                case lex_prefix_block_p:
                    setLexicalLast(lex_lp, TK_LP);
                    return FINISH_TOKEN;
                case lex_prefix_block_b:
                    setLexicalLast(lex_lb, TK_LB);
                    return FINISH_TOKEN;
                case lex_prefix_block_c:
                    setLexicalLast(lex_lc, TK_LC);
                    return FINISH_TOKEN;
                default:
                    fatalErrorLog(aFunCoreLogger, EXIT_FAILURE, "Switch illegal characters");
                    return ERROR_TOKEN;
            }
        } else if (ch == ')') {
            switch (lexical.status) {
                case lex_prefix_block_p:
                    setLexicalLast(lex_rp, TK_RP);
                    return FINISH_TOKEN;
                case lex_prefix_block_b:
                    setLexicalLast(lex_rb, TK_RB);
                    return FINISH_TOKEN;
                case lex_prefix_block_c:
                    setLexicalLast(lex_rc, TK_RC);
                    return FINISH_TOKEN;
                default:
                    fatalErrorLog(aFunCoreLogger, EXIT_FAILURE, "Switch illegal characters");
                    return ERROR_TOKEN;
            }
        }
        // TODO-szh: emit a warning
        return DEL_TOKEN;
    }
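    /*
     * Taken together with doneBegin above, the two-character sequences `!(`,
     * `@(` and `#(` therefore yield TK_LP, TK_LB and TK_LC, and `!)`, `@)` and
     * `#)` yield TK_RP, TK_RB and TK_RC, just like the plain bracket characters.
     */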
    /*
     * State machine diagram:
     * [lex_comment_before]
     *     -> '\n' || NUL -> [lex_uni_comment_end] # return FINISH_TOKEN
     *     -> ; -> (lex_mutli_comment) # mutli_comment = 0
     *     -> other -> (lex_uni_comment)
     */
    int Parser::doneCommentBefore(char ch) {
        if (ch == '\n' || ch == aFuntool::NUL) {
            setLexicalLast(lex_uni_comment_end, TK_COMMENT);
            return FINISH_TOKEN;
        } else if (ch == ';') {  // multi-line comment
            lexical.status = lex_mutli_comment;
            lexical.mutli_comment = 0;
            return 1;
        }
        lexical.status = lex_uni_comment;
        return 1;
    }

    /*
     * State machine diagram:
     * [lex_uni_comment]
     *     -> '\n' || NUL -> [lex_uni_comment_end] # return FINISH_TOKEN
     *     -> other -> (lex_uni_comment)
     */
    int Parser::doneUniComment(char ch) {
        if (ch == '\n' || ch == aFuntool::NUL) {
            setLexicalLast(lex_uni_comment_end, TK_COMMENT);
            return FINISH_TOKEN;
        }
        lexical.status = lex_uni_comment;
        return 1;
    }
    /*
     * State machine diagram:
     * [lex_mutli_comment]
     *     -> NUL -> [lex_mutli_comment_end] # return FINISH_TOKEN; [warning]
     *     -> ; -> (lex_mutli_comment_end_before)
     *     -> other -> (lex_mutli_comment)
     */
    int Parser::doneMutliComment(char ch) {
        if (ch == aFuntool::NUL) {
            setLexicalLast(lex_mutli_comment_end, TK_COMMENT);  // unterminated multi-line comment at end of input
            // TODO-szh: emit a warning
            return FINISH_TOKEN;
        } else if (ch == ';')
            lexical.status = lex_mutli_comment_end_before;
        else
            lexical.status = lex_mutli_comment;
        return 1;
    }
    /*
     * State machine diagram:
     * [lex_mutli_comment_end_before]
     *     -> NUL -> [lex_mutli_comment_end] # return FINISH_TOKEN; [warning]
     *     -> ; -> (lex_mutli_comment) # mutli_comment++;
     *     -> = ->
     *         mutli_comment == 0 -> [lex_mutli_comment_end] # return FINISH_TOKEN
     *         else -> (lex_mutli_comment) # mutli_comment--;
     */
    int Parser::doneMutliCommentBeforeEnd(char ch) {
        if (ch == aFuntool::NUL) {
            setLexicalLast(lex_mutli_comment_end, TK_COMMENT);
            // TODO-szh: emit a warning
            return FINISH_TOKEN;
        } else if (ch == ';') {
            /* nested comment */
            lexical.mutli_comment++;
            lexical.status = lex_mutli_comment;
        } else if (ch == '=') {
            if (lexical.mutli_comment == 0) {
                /* end of the comment */
                setLexicalLast(lex_mutli_comment_end, TK_COMMENT);
                return FINISH_TOKEN;
            } else {
                /* leaving a nested comment */
                lexical.mutli_comment--;
                lexical.status = lex_mutli_comment;
            }
        }
        lexical.status = lex_mutli_comment;
        return 1;
    }
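    /*
     * For example: `; text` followed by a newline is one single-line TK_COMMENT,
     * while `;;` opens a multi-line comment that `;=` closes. Inside it each
     * further `;;` increments mutli_comment and each `;=` decrements it, so
     * `;; outer ;; inner ;= outer ;=` is read as one nested comment token.
     * Comment tokens are later discarded by getTokenFromLexical.
     */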
    /*
     * State machine diagram:
     * [lex_element_long]
     *     -> NUL -> error
     *     -> | -> [lex_element_long_end]
     *     -> other -> (lex_element_long)
     */
    int Parser::doneElementLong(char ch) {
        if (ch == '|') {  // terminator
            setLexicalLast(lex_element_long_end, TK_ELEMENT_LONG);
            return 1;
        } else if (ch == aFuntool::NUL) {
            // TODO-szh: emit a warning
            return ERROR_TOKEN;
        }
        lexical.status = lex_element_long;
        return 1;
    }

    /*
     * State machine diagram:
     * [lex_element_long_end]
     *     -> | -> (lex_element_long)
     *     -> other -> [lex_element_long_end] # return FINISH_TOKEN
     */
    int Parser::doneElementLongEnd(char ch) {
        if (ch == '|') {  // `||` does not terminate the element
            lexical.status = lex_element_long;
            return 1;
        }
        lexical.status = lex_element_long_end;
        return FINISH_TOKEN;
    }
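    /*
     * For example: a long element is written between `|` characters, and `||`
     * inside it stands for a literal `|`, so `|hello world|` and `|a||b|` are
     * single TK_ELEMENT_LONG tokens. getTokenFromLexical later strips the
     * delimiters and collapses `||`, producing the texts `hello world` and `a|b`.
     */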
    /*
     * State machine diagram:
     * [lex_element_short]
     *     -> !strchr("!@#([{}]);,", ch) && iselement(ch) -> [lex_element_short]
     *     -> other -> (lex_element_short) # return FINISH_TOKEN
     */
    int Parser::doneElementShort(char ch) {
        if (!strchr("!@#([{}]);,", ch) && iselement(ch)) {  // visible characters other than whitespace (NUL excluded)
            setLexicalLast(lex_element_short, TK_ELEMENT_SHORT);
            return 1;
        }
        lexical.status = lex_element_short;
        return FINISH_TOKEN;
    }

    /*
     * State machine diagram:
     * [lex_space]
     *     -> ch != NUL && isignore(ch) -> [lex_space]
     *     -> other -> (lex_space) # return FINISH_TOKEN
     */
    int Parser::doneSpace(char ch) {
        if (ch != aFuntool::NUL && isignore(ch)) {
            setLexicalLast(lex_space, TK_SPACE);
            return 1;
        }
        lexical.status = lex_space;
        return FINISH_TOKEN;
    }
    /*
     * Function: getTokenFromLexical
     * Purpose: fetch the next TokenType from the Lexical along with its text
     */
    Parser::TokenType Parser::getTokenFromLexical(std::string &text) {
        Parser::TokenType tt;
        int re;
        lexical.status = lex_begin;
        lexical.last = 0;
        text = "";

        if (lexical.is_end)
            return TK_EOF;
        else if (lexical.is_error || reader.isError())  /* do not run again after lexical or reader has errored */
            return TK_ERROR;

        while (true) {
            char ch = reader.getChar();
            if (reader.isError())
                return TK_ERROR;

            if (isascii(ch) && iscntrl(ch) && !isspace(ch) && ch != aFuntool::NUL)  // ASCII control character
                NULL;  // TODO-szh: emit a warning

            switch (lexical.status) {
                case lex_begin:
                    re = doneBegin(ch);
                    break;
                case lex_prefix_block_p:
                case lex_prefix_block_b:
                case lex_prefix_block_c:
                    re = donePrefixBlock(ch);
                    break;
                case lex_comment_before:
                    re = doneCommentBefore(ch);
                    break;
                case lex_element_long:
                    re = doneElementLong(ch);
                    break;
                case lex_mutli_comment:
                    re = doneMutliComment(ch);
                    break;
                case lex_uni_comment:
                    re = doneUniComment(ch);
                    break;
                case lex_mutli_comment_end_before:
                    re = doneMutliCommentBeforeEnd(ch);
                    break;
                case lex_space:
                    re = doneSpace(ch);
                    break;
                case lex_element_short:
                    re = doneElementShort(ch);
                    break;
                case lex_element_long_end:
                    re = doneElementLongEnd(ch);
                    break;
                default:
                    fatalErrorLog(aFunCoreLogger, EXIT_FAILURE, "Switch illegal characters");
                    re = ERROR_TOKEN;
                    break;
            }

            if (re == ERROR_TOKEN) {
                tt = TK_ERROR;
                break;
            } else if (re == DEL_TOKEN) {  // discard this token and keep lexing
                char *word = reader.readWord(lexical.last);
                free(word);
                lexical.status = lex_begin;
                lexical.last = 0;
                continue;
            } else if (re == FINISH_TOKEN) {
                char *word = reader.readWord(lexical.last);
                if (word == nullptr) {
                    tt = TK_ERROR;
                    break;
                }
                tt = lexical.token;
                if (tt == TK_ELEMENT_SHORT || tt == TK_PREFIX) {
                    text = word;
                    aFuntool::safeFree(word);
                } else if (tt == TK_ELEMENT_LONG) {
                    char *new_str = aFuntool::safeCalloc<char>(strlen(word) - 2 + 1);  // strip the leading and trailing `|`
                    bool flat = false;
                    char *p = word + 1;
                    size_t count = 0;
                    for (NULL; *p != aFuntool::NUL; p++) {
                        if (*p == '|' && !flat) {  // skip the first `|`; a trailing `|` is simply dropped, otherwise the character is written when the second `|` is met
                            flat = true;  /* count does not need to increase */
                            continue;
                        } else if (*p != '|' && flat)  // error encountered
                            break;
                        else
                            flat = false;
                        new_str[count] = *p;
                        count++;
                    }
                    text = new_str;
                    free(word);
                    free(new_str);
                } else
                    free(word);

                if (tt == TK_SPACE || tt == TK_COMMENT) {  // whitespace and comments are not returned to the caller
                    lexical.status = lex_begin;
                    lexical.last = 0;
                    continue;
                } else if (tt == TK_EOF)
                    lexical.is_end = true;
                break;
            }
        }
        return tt;
    }
}
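
/*
 * Usage sketch: one way a caller might drive the lexer, shown here only as an
 * illustration and wrapped in `#if 0` so it is never compiled. It assumes that
 * `aFuncore::Parser` exposes the nested TokenType enumerators (TK_EOF,
 * TK_ERROR, ...) to outside code, that a Parser object has already been
 * constructed around a reader, and that <cstdio> is available -- these are
 * assumptions about parser.h, not guarantees made by this file.
 */
#if 0
#include <cstdio>

static void dumpTokens(aFuncore::Parser &parser) {
    std::string text;
    for (;;) {
        // Each call returns the next significant token and fills `text` for
        // element/prefix tokens; TK_SPACE and TK_COMMENT never reach the caller.
        aFuncore::Parser::TokenType tt = parser.getTokenFromLexical(text);
        if (tt == aFuncore::Parser::TK_ERROR || tt == aFuncore::Parser::TK_EOF)
            break;
        std::printf("token %d: %s\n", static_cast<int>(tt), text.c_str());
    }
}
#endif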