瀏覽代碼

feat: 新增utf-8检查

SongZihuan 3 年之前
父節點
當前提交
d48145b9c1
共有 8 個文件被更改,包括 69 次插入和 3 次删除
  1. 1 0
      include/tool/file.h
  2. 2 0
      include/tool/log.h
  3. 8 2
      src/core/code.c
  4. 4 0
      src/core/lexical.c
  5. 8 1
      src/core/reader.c
  6. 5 0
      src/core/syntactic.c
  7. 35 0
      src/tool/file.c
  8. 6 0
      src/tool/regex.c

+ 1 - 0
include/tool/file.h

@@ -14,4 +14,5 @@ AFUN_TOOL_EXPORT char *fileNameToVar(char *name, bool need_free);
 AFUN_TOOL_EXPORT char *findPath(char *path, char *env, bool need_free);
 AFUN_TOOL_EXPORT char *getExedir(char *pgm, int dep);
 AFUN_TOOL_EXPORT uintmax_t getFileSize(char *path);
+AFUN_TOOL_EXPORT bool isCharUTF8(char *str);
 #endif //AFUN_FILE_H

+ 2 - 0
include/tool/log.h

@@ -67,5 +67,7 @@ AFUN_TOOL_EXPORT int writeFatalErrorLog_(Logger *logger, char *file, int line, c
 #define assertInfoLog(c, logger, ...) ((c) || writeInfoLog(logger, "Assert " #c " error : " __VA_ARGS__))
 #define assertWarningLog(c, logger, ...) ((c) || writeWarningLog(logger, "Assert " #c " error : " __VA_ARGS__))
 #define assertErrorLog(c, logger, ...) ((c) || writeErrorLog(logger, "Assert " #c " error : " __VA_ARGS__))
+#define assertSendErrorLog(c, logger, ...) ((c) || writeSendErrorLog(logger, "Assert " #c " error : " __VA_ARGS__))
+#define assertFatalErrorLog(c, logger, exit_code, ...) ((c) || writeFatalErrorLog(logger, exit_code, "Assert " #c " error : " __VA_ARGS__))
 
 #endif //AFUN_LOG_H

+ 8 - 2
src/core/code.c

@@ -42,6 +42,9 @@ af_Code *makeElementCode(char *var, char prefix, FileLine line, FilePath path) {
     if (prefix != NUL && strchr(E_PREFIX, prefix) == NULL)
         prefix = NUL;
 
+    if (!isCharUTF8(var))
+        return NULL;
+
     af_Code *bt = makeCode(prefix, line, path);
     bt->type = code_element;
     bt->element.data = strCopy(var);
@@ -555,8 +558,11 @@ char *getCodeMD5(af_Code *code) {
     return md5str;
 }
 
-static bool codeElementCheck(const char *data) {
-    return data != NULL;
+static bool codeElementCheck(char *data) {
+    // An element is acceptable only when it has text and that text passes the UTF-8 check.
+    bool has_text = (data != NULL);
+
+    return has_text && isCharUTF8(data);
 }
 
 /*

+ 4 - 0
src/core/lexical.c

@@ -382,6 +382,9 @@ af_TokenType getTokenFromLexical(char **text, af_Parser *parser) {
 
         if (re == -1) {
             char *word = readWord(parser->lexical->last, parser->reader);
+            if (word == NULL)
+                goto ERROR;
+
             tt = parser->lexical->token;
 
             if (tt == TK_ELEMENT_SHORT || tt == TK_PREFIX)
@@ -425,6 +428,7 @@ af_TokenType getTokenFromLexical(char **text, af_Parser *parser) {
             parser->lexical->last = 0;
             continue;
         } else if (re == -2 || re == -3) {
+ERROR:
             tt = TK_ERROR;
             *text = NULL;
             parser->lexical->is_error = true;

+ 8 - 1
src/core/reader.c

@@ -1,4 +1,5 @@
-#include "__reader.h"
+#include "core_init.h"
+#include "__reader.h"
 
 af_Reader *makeReader(DLC_SYMBOL(readerFunc) read_func, DLC_SYMBOL(destructReaderFunc) destruct_func, size_t data_size) {
     af_Reader *reader = calloc(1, sizeof(af_Reader));
@@ -63,6 +64,12 @@ char *readWord(size_t del_index, af_Reader *reader) {
             reader->line++;
     }
 
+    if (!isCharUTF8(re)) {
+        free(re);
+        writeErrorLog(aFunCoreLogger, "Is not utf-8");
+        return NULL;
+    }
+
     return re;
 }
 

+ 5 - 0
src/core/syntactic.c

@@ -42,6 +42,11 @@ static af_Code *code(size_t deep, char prefix, af_Parser *parser) {
         case TK_ELEMENT_SHORT:
         case TK_ELEMENT_LONG:
             re = makeElementCode(parser->syntactic->text, prefix, parser->reader->line, NULL);
+            if (re == NULL) {
+                writeErrorLog(aFunCoreLogger, "Creat element code error: %s", parser->syntactic->text);
+                freeAllCode(code_list);
+                return NULL;
+            }
             free(parser->syntactic->text);
             break;
         case TK_LP:

+ 35 - 0
src/tool/file.c

@@ -189,3 +189,38 @@ uintmax_t getFileSize(char *path) {
     return (uintmax_t)statbuf.st_size;  // 返回文件大小
 
 }
+
+/*
+ * Function: isCharUTF8
+ * Purpose: return true when the NUL-terminated string str consists only of complete UTF-8 style sequences (lead bytes up to the 6-byte form are accepted); return false on a stray or missing continuation byte, or when the terminator cuts a sequence short.
+ */
+bool isCharUTF8(char *str) {
+    int code = 0;  // continuation bytes still expected for the current character
+    for (char *ch = str; *ch != NUL; ch++) {
+        unsigned char c = *ch;
+        unsigned char c_ = ~c;  // inverted byte: "leading bits all set" becomes "masked bits all zero"
+
+        assertFatalErrorLog(code >= 0 && code <= 5, NULL, 2, "str = %s", str);
+        if (code == 0) {
+            if ((c_ & 0xFC) == 0 && (c & 0x02) == 0)  // 1111110x: the 0xFC mask on the inverted byte checks that the top 6 bits are set, then bit 1 must be clear
+                code = 5;  // 5 continuation bytes follow
+            else if ((c_ & 0xF8) == 0 && (c & 0x04) == 0)  // 111110xx
+                code = 4;  // 4 continuation bytes follow
+            else if ((c_ & 0xF0) == 0 && (c & 0x08) == 0)  // 11110xxx
+                code = 3;  // 3 continuation bytes follow
+            else if ((c_ & 0xE0) == 0 && (c & 0x10) == 0)  // 1110xxxx
+                code = 2;  // 2 continuation bytes follow
+            else if ((c_ & 0xC0) == 0 && (c & 0x20) == 0)  // 110xxxxx
+                code = 1;  // 1 continuation byte follows
+            else if ((c & 0x80) == 0)  // 0xxxxxxx: single-byte (ASCII) character
+                code = 0;
+            else
+                return false;  // 10xxxxxx with no lead byte, or 0xFE/0xFF
+        } else if ((c_ & 0x80) == 0 && (c & 0x40) == 0)  // 10xxxxxx: expected continuation byte
+            code--;
+        else
+            return false;  // a lead or ASCII byte arrived where a continuation byte was required
+    }
+
+    return code == 0;  // continuation bytes still pending here means the last character was truncated
+}

+ 6 - 0
src/tool/regex.c

@@ -8,6 +8,9 @@
 char regex_error[REGEX_ERROR_SIZE];
 
 af_Regex *makeRegex(char *pattern) {
+    if (!isCharUTF8(pattern))
+        return NULL;
+
     int error_code;
     size_t erroroffset;
     pcre2_code *re = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED, 0, &error_code, &erroroffset, NULL);
@@ -39,6 +42,9 @@ void freeRegex(af_Regex *rg) {
  * 返回 (>0) - 失败
  */
 int matchRegex(char *subject, af_Regex *rg) {
+    if (!isCharUTF8(subject))
+        return 0;
+
     PCRE2_SPTR sub = (PCRE2_SPTR)subject;
     PCRE2_SIZE sub_len = strlen(subject);
     pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(rg->re, NULL);