#include "lexer.h"
#include "exception.h"
#include "support.h"

#include <algorithm>
#include <cctype>
#include <cstdint>
#include <cstring>
#include <string>

namespace
{
    // The <cctype> functions have undefined behavior when called with a negative value other than EOF. Source text is UTF-8
    // and 'char' may be signed, so every classification goes through these wrappers, which convert to unsigned char first.

    // Returns true when the character is an alphabetical character.
    inline bool IsAlphaChar(char c)
    {
        return std::isalpha(static_cast<unsigned char>(c)) != 0;
    }

    // Returns true when the character is a decimal digit.
    inline bool IsDigitChar(char c)
    {
        return std::isdigit(static_cast<unsigned char>(c)) != 0;
    }

    // Returns true when the character is an alphabetical character or a decimal digit.
    inline bool IsAlnumChar(char c)
    {
        return std::isalnum(static_cast<unsigned char>(c)) != 0;
    }

    // Returns the lower-case variant of the character.
    inline char ToLowerChar(char c)
    {
        return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
    }

    // Returns true when the character is part of the zero-terminated set. std::strchr also matches the terminating zero of the
    // set; the end-of-string character is therefore explicitly excluded.
    inline bool IsCharInSet(const char* szSet, char c)
    {
        return c != '\0' && std::strchr(szSet, c) != nullptr;
    }

    // Literal represented by a reserved word.
    struct SKeywordLiteral
    {
        const char*         szText;     // Spelling of the literal
        uint32_t            uiLength;   // Length of the spelling in characters
        ETokenLiteralType   eType;      // Literal type assigned to the token
    };

    // The reserved words that are lexed as literals instead of identifiers/keywords.
    const SKeywordLiteral g_rgKeywordLiterals[] = {
        {"true",    4, ETokenLiteralType::token_literal_boolean},
        {"TRUE",    4, ETokenLiteralType::token_literal_boolean},
        {"false",   5, ETokenLiteralType::token_literal_boolean},
        {"FALSE",   5, ETokenLiteralType::token_literal_boolean},
        {"nullptr", 7, ETokenLiteralType::token_literal_nullptr},
        {"NULL",    4, ETokenLiteralType::token_literal_nullptr},
    };

    // Check whether the code at the current position starts with one of the keyword literals and the keyword is not directly
    // followed by a letter or a digit. Returns the matching entry or nullptr when there is no match.
    // The text of the location token is converted to a string only once and then compared against all keywords.
    const SKeywordLiteral* MatchKeywordLiteral(CCodePos& rCode)
    {
        CToken tokenTemp = rCode.GetLocation();
        const std::string ssText = static_cast<std::string>(tokenTemp);
        for (const SKeywordLiteral& rsKeyword : g_rgKeywordLiterals)
        {
            if (ssText.compare(0, rsKeyword.uiLength, rsKeyword.szText) == 0 && !IsAlnumChar(rCode[rsKeyword.uiLength]))
                return &rsKeyword;
        }
        return nullptr;
    }
} // namespace

// Construct the lexer. The callback receives whitespace, comments and preprocessor directives and is compulsory; a
// CCompileException is thrown when it is missing. The reserved keyword list starts as a copy of the OMG IDL keywords.
CLexer::CLexer(ILexerCallback* pCallback, bool bCaseSensitive, ELexingMode eLexingMode /*= ELexingMode::lexing_idl*/) :
    m_pLexerCallback(pCallback), m_bCaseSensitive(bCaseSensitive), m_eLexingMode(eLexingMode),
    m_vecReservedKeywords(g_vecOmgIdlKeywords)
{
    if (!pCallback)
        throw CCompileException("Invalid parameter; no callbback provided.");
}

// Add an additional keyword to the list of reserved keywords.
void CLexer::AddKeyword(const std::string& rssKeyword)
{
    m_vecReservedKeywords.push_back(rssKeyword);
}

// Get the next token from the code. Whitespace and comments are reported to the callback and skipped. An empty token is
// returned at the end of the string, when no further whitespace could be consumed (for example at the end of the line when
// lexing preprocessor directives) and after a preprocessor directive was handed to the callback. A CCompileException is
// thrown for characters that cannot start a token.
CToken CLexer::GetToken(CCodePos& rCode, const CContextPtr& rptrContext) const
{
    if (!rCode.IsValid())
        return CToken();

    // Based on the character value decide what to do. For this a table is used mapping a character (byte) to a classification.
    // This table uses the UTF-8 coding scheme. Character values under 0x20 are control and space characters. Character values
    // between 0x80 and 0xBF are part of a multi-byte character and therefore have no parsing function when found stand-alone.
    // Character values between 0xC0 and 0xF7 are the initial part of a multi-byte character and are classified as identifier
    // characters. Character values between 0xF8 and 0xFF are not defined.
    // NOTE(review): GetIdentifierOrKeyword only accepts what std::isalpha accepts, which normally excludes bytes >= 0x80; the
    // 'ident' classification of 0xC0..0xF7 therefore likely still ends in an "expected identifier" exception - confirm intent.
    // The following abbreviations are used:
    //   none  - not classified
    //   eof   - end of file (end of string)
    //   space - whitespace (including the backslash, to support line continuation)
    //   ident - identifier
    //   lit   - literal (number, string)
    //   idlit - identifier or literal
    //   split - separator or literal
    //   sep   - separator
    //   oper  - operator
    //   opcom - operator or comments
    //   pproc - pre-processor directive or symbol
    enum ECharClass : uint8_t {none, eof, space, ident, lit, idlit, split, sep, oper, opcom, pproc};
    static const ECharClass rgeClassify[256] = {
        // 0x0  0x1    0x2    0x3    0x4    0x5    0x6    0x7    0x8    0x9    0xA    0xB    0xC    0xD    0xE    0xF
        eof,   none,  none,  none,  none,  none,  none,  none,  none,  space, space, space, space, space, none,  none,  // 0x00
        none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  // 0x10
        space, oper,  lit,   pproc, none,  oper,  oper,  lit,   sep,   sep,   oper,  oper,  oper,  oper,  split, opcom, // 0x20
        // The question mark (0x3F) is handled by GetOperator and not by GetSeparator; it is classified as operator.
        lit,   lit,   lit,   lit,   lit,   lit,   lit,   lit,   lit,   lit,   sep,   sep,   oper,  oper,  oper,  oper,  // 0x30
        none,  ident, ident, ident, ident, ident, idlit, ident, ident, ident, ident, idlit, idlit, ident, idlit, ident, // 0x40
        ident, ident, idlit, ident, idlit, idlit, ident, ident, ident, ident, ident, sep,   space, sep,   oper,  ident, // 0x50
        none,  ident, ident, ident, ident, ident, idlit, ident, ident, ident, ident, ident, ident, ident, idlit, ident, // 0x60
        ident, ident, ident, ident, idlit, idlit, ident, ident, ident, ident, ident, sep,   oper,  sep,   oper,  none,  // 0x70
        none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  // 0x80
        none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  // 0x90
        none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  // 0xA0
        none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  // 0xB0
        ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, // 0xC0
        ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, // 0xD0
        ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, // 0xE0
        ident, ident, ident, ident, ident, ident, ident, ident, none,  none,  none,  none,  none,  none,  none,  none   // 0xF0
    };

    // Determine the action to do; repeat if needed
    CToken token;
    bool bRetry = false;
    do
    {
        bRetry = false;
        switch (rgeClassify[static_cast<uint8_t>(*rCode)])
        {
        case eof:
            return CToken(); // End of file/end of string.
        case space:
            token = GetWhitespace(rCode, m_bNewlineOccurred);
            token.SetContext(rptrContext);
            if (token.GetLength())
            {
                // Callback on the whitespace
                if (m_pLexerCallback)
                    m_pLexerCallback->InsertWhitespace(token);

                // Continue processing
                bRetry = true;
            }
            else
            {
                // Nothing could be consumed (e.g. end of line while lexing preprocessor directives or a backslash that is
                // not followed by a line break); an empty token is returned.
                token = CToken();
            }
            break;
        case ident:
            token = GetIdentifierOrKeyword(rCode);
            m_bNewlineOccurred = false;
            break;
        case lit:
            token = GetLiteral(rCode);
            m_bNewlineOccurred = false;
            break;
        case idlit:
            // Capturing wide, UTF-8, UTF-16, UTF-32 strings and raw strings: a quote or apostrophe following a prefix of at
            // most three characters.
            if ((rCode[1] == '\"' || rCode[1] == '\'') ||
                (IsAlnumChar(rCode[1]) && (rCode[2] == '\"' || rCode[2] == '\'')) ||
                (IsAlnumChar(rCode[1]) && IsAlphaChar(rCode[2]) && (rCode[3] == '\"' || rCode[3] == '\'')))
                token = GetLiteral(rCode);
            else
            {
                // Check for the literals true, false and nullptr (and the upper-case variants TRUE, FALSE and NULL). An
                // underscore following the keyword makes it an ordinary identifier (e.g. "true_value"); without this check
                // such an identifier would be lexed as a literal followed by invalid characters.
                const SKeywordLiteral* psKeyword = MatchKeywordLiteral(rCode);
                if (psKeyword && rCode[psKeyword->uiLength] != '_')
                    token = GetLiteral(rCode);
                else
                    token = GetIdentifierOrKeyword(rCode);
            }
            m_bNewlineOccurred = false;
            break;
        case split:
            // Capturing floating points starting with a dot
            if (IsDigitChar(rCode[1]))
                token = GetLiteral(rCode);
            else
                token = GetSeparator(rCode);
            // Like any other token, the dot ends the "start of line" state needed for preprocessor directives.
            m_bNewlineOccurred = false;
            break;
        case sep:
            token = GetSeparator(rCode);
            m_bNewlineOccurred = false;
            break;
        case oper:
            token = GetOperator(rCode);
            m_bNewlineOccurred = false;
            break;
        case opcom:
            if (rCode[1] == '/' || rCode[1] == '*')
            {
                token = GetComments(rCode);
                token.SetContext(rptrContext);

                // Callback on the comment.
                if (m_pLexerCallback)
                    m_pLexerCallback->InsertComment(token);

                // Continue processing
                bRetry = true;
            }
            else
            {
                token = GetOperator(rCode);
                m_bNewlineOccurred = false;
            }
            break;
        case pproc:
            if (m_eLexingMode != ELexingMode::lexing_preproc)
            {
                // The '#' token is only a valid operator when lexing preprocessor directives. Outside of a directive it must
                // be the first token on the line.
                if (!m_bNewlineOccurred)
                    throw CCompileException(rCode.GetLocation(), "Unexpected token '#'.");

                // Callback on the preprocessor.
                if (m_pLexerCallback)
                    m_pLexerCallback->ProcessPreprocDirective(rCode);

                // REMARKS: The preprocessor could include another file. This will change the context of the parser. Return an
                // empty token to indicate the parser to try to get the next token.
                token = CToken();
            }
            else
            {
                token = GetOperator(rCode);
                m_bNewlineOccurred = false;
            }
            break;
        case none:
        default:
            throw CCompileException(rCode.GetLocation(), "Unexpected token '", *rCode, "'.");
        }
    } while (bRetry); // Retry when some intermediate processing took place.

    // Remember the token as last valid token (stored before the context is assigned) and assign the context.
    if (token)
    {
        m_tokenLastValid = token;
        token.SetContext(rptrContext);
    }
    return token;
}

// Return the last valid token stored by GetToken or GetCustom.
const CToken& CLexer::GetLastValidToken() const
{
    return m_tokenLastValid;
}

// Get a custom token covering all text from the current position up to (not including) the supplied symbol. Scanning also
// stops at the end of the string and, when lexing preprocessor directives, at the end of the line. A backslash followed by a
// line break is skipped as line continuation; any other backslash is skipped as a single character.
CToken CLexer::GetCustom(CCodePos& rCode, char cSymbol) const
{
    if (!rCode.IsValid())
        return CToken();

    // Initialize the token
    CToken token = rCode.GetLocation();

    // Run until done
    const bool bPreproc = m_eLexingMode == ELexingMode::lexing_preproc;
    bool bDone = false;
    while (!bDone)
    {
        switch (*rCode)
        {
        case '\\':
            if (rCode[1] == '\n')
                rCode += 2;
            else if (rCode[1] == '\r' && rCode[2] == '\n')
                rCode += 3;
            else
                ++rCode;
            break;
        case '\r':
            if (bPreproc && rCode[1] == '\n')
                bDone = true;
            else
                ++rCode;
            break;
        case '\n':
            if (bPreproc)
                bDone = true;
            else
            {
                ++rCode;
                m_bNewlineOccurred = true;
            }
            break;
        case '\0':
            bDone = true;
            break;
        default:
            if (*rCode == cSymbol)
                bDone = true;
            else
                ++rCode;
            break;
        }
    }

    // Update the token
    rCode.UpdateLocation(token);
    if (token)
        m_tokenLastValid = token;
    return token;
}

// Skip the rest of the line. The line break itself is consumed as well (and registered as "newline occurred"), except when
// lexing preprocessor directives, where the position stays at the line break. A backslash followed by a line break continues
// the line.
void CLexer::SkipLine(CCodePos& rCode) const
{
    if (!rCode.IsValid())
        return;

    const bool bPreproc = m_eLexingMode == ELexingMode::lexing_preproc;
    bool bDone = false;
    while (!bDone)
    {
        switch (*rCode)
        {
        case '\0':
            bDone = true;
            break;
        case '\r':
            if (rCode[1] == '\n')
            {
                if (!bPreproc)
                {
                    rCode += 2;
                    m_bNewlineOccurred = true;
                }
                bDone = true;
            }
            else
            {
                m_bNewlineOccurred = false;
                ++rCode;
            }
            break;
        case '\n':
            if (!bPreproc)
            {
                ++rCode;
                m_bNewlineOccurred = true;
            }
            bDone = true;
            break;
        case '\\':
            if (rCode[1] == '\n')
                rCode += 2;
            else if (rCode[1] == '\r' && rCode[2] == '\n')
                rCode += 3;
            else
            {
                m_bNewlineOccurred = false;
                ++rCode;
            }
            break;
        default:
            m_bNewlineOccurred = false;
            ++rCode;
            break;
        }
    }
}

// Mark the current position as "start of line", which allows a following '#' to be processed as preprocessor directive.
void CLexer::EnablePreprocProcessing()
{
    m_bNewlineOccurred = true;
}

// Get a whitespace token. rbNewline is set to true when a line break was consumed; it is never reset by this function. When
// lexing preprocessor directives, line breaks are not consumed and end the whitespace.
CToken CLexer::GetWhitespace(CCodePos& rCode, bool& rbNewline) const
{
    if (!rCode.IsValid())
        return CToken();

    CToken token = rCode.GetLocation(ETokenType::token_whitespace);
    const bool bPreproc = m_eLexingMode == ELexingMode::lexing_preproc;
    bool bDone = false;
    while (!bDone)
    {
        // Check the current position.
        // Space, tab, form feed, vertical tab, carriage return and line feed are considered whitespace, as is a backslash
        // directly followed by a line break (line continuation).
        switch (*rCode)
        {
        case ' ':  // Space
        case '\t': // Tab
        case '\f': // Form feed
        case '\v': // Vertical tab
            ++rCode;
            break;
        case '\r': // Carriage return
            if (rCode[1] != '\n')
                ++rCode;
            else if (!bPreproc)
            {
                rCode += 2;
                rbNewline = true;
            }
            else
                bDone = true;
            break;
        case '\n': // Line feed
            if (!bPreproc)
            {
                ++rCode;
                rbNewline = true;
            }
            else
                bDone = true;
            break;
        case '\\': // Line continuation
            if (rCode[1] == '\n')
                rCode += 2;
            else if (rCode[1] == '\r' && rCode[2] == '\n')
                rCode += 3;
            else
                bDone = true;
            break;
        case '\0':
        default:
            bDone = true;
            break;
        }
    }

    // Update the token
    rCode.UpdateLocation(token);
    return token;
}

// Get a comment token; either a C-style comment ("/* ... */") or a C++-style comment ("// ..." until the end of the line).
// Throws a CCompileException when the position is not at a comment or when a C-style comment is not closed. When lexing
// preprocessor directives, a line break ends the scan (for a C-style comment this means the comment counts as not closed).
CToken CLexer::GetComments(CCodePos& rCode) const
{
    if (!rCode.IsValid())
        return CToken();

    // Initialize the token
    CToken sToken = rCode.GetLocation(ETokenType::token_comments);
    const bool bPreproc = m_eLexingMode == ELexingMode::lexing_preproc;

    if (rCode[0] == '/' && rCode[1] == '*') // C-style comment?
    {
        // Skip the comment opening
        rCode += 2;

        // Skip characters until comment closing
        bool bDone = false;
        bool bEnd = false;
        while (!bDone)
        {
            switch (*rCode)
            {
            case '*':
                if (rCode[1] == '/')
                {
                    // Skip the comment closing
                    rCode += 2;
                    bEnd = true;
                    bDone = true;
                }
                else
                    ++rCode;
                break;
            case '\r':
                if (rCode[1] != '\n')
                    ++rCode;
                else if (!bPreproc)
                    rCode += 2;
                else
                    bDone = true;
                break;
            case '\n':
                if (!bPreproc)
                    ++rCode;
                else
                    bDone = true;
                break;
            case '\0':
                bDone = true;
                break;
            default:
                ++rCode;
                break;
            }
        }

        // Comment was closed properly?
        if (!bEnd)
            throw CCompileException(rCode.GetLocation(ETokenType::token_comments),
                "End of file while looking for a close comment statement.");

        // Update the token
        rCode.UpdateLocation(sToken);
    }
    else if (rCode[0] == '/' && rCode[1] == '/') // C++-style comment?
    {
        // Skip the comment opening
        rCode += 2;

        // Skip characters until the end of the line; the line break is part of the comment unless lexing preprocessor
        // directives.
        bool bDone = false;
        while (!bDone)
        {
            switch (*rCode)
            {
            case '\r':
                if (rCode[1] == '\n')
                {
                    if (!bPreproc)
                        rCode += 2;
                    bDone = true;
                }
                else
                    ++rCode;
                break;
            case '\n':
                if (!bPreproc)
                    ++rCode;
                bDone = true;
                break;
            case '\0':
                bDone = true;
                break;
            default:
                ++rCode;
                break;
            }
        }

        // Update the token (once, after the end of the comment was found)
        rCode.UpdateLocation(sToken);
    }
    else
        throw CCompileException(rCode.GetLocation(ETokenType::token_comments), "Expecting C-sytel or C++ comments.");

    return sToken;
}

// Get an identifier or keyword token. Unless lexing preprocessor directives, the identifier is checked against the reserved
// keywords: an exact match returns a keyword token; when the lexer is not case-sensitive, an identifier that differs from a
// keyword only in case causes a CCompileException. A CCompileException is also thrown when the position is not at the start
// of an identifier or when the identifier is directly followed by a quote or apostrophe.
CToken CLexer::GetIdentifierOrKeyword(CCodePos& rCode) const
{
    if (!rCode.IsValid())
        return CToken();

    // Check for an alphabetical character.
    if (!IsAlphaChar(*rCode) && *rCode != '_')
        throw CCompileException(rCode.GetLocation(), "Invalid symbol; expected identifier.");

    // Initialize two tokens: identifier and keyword (this is not an expensive action)
    CToken tokenIdentifier = rCode.GetLocation(ETokenType::token_identifier);
    CToken tokenKeyword = rCode.GetLocation(ETokenType::token_keyword);

    // Continue until the current character is not alpha-numerical any more
    while (IsAlnumChar(*rCode) || *rCode == '_')
        ++rCode;

    // Update the tokens
    rCode.UpdateLocation(tokenIdentifier);
    rCode.UpdateLocation(tokenKeyword);

    // When lexing preprocessor directives there is no special treatment.
    if (m_eLexingMode == ELexingMode::lexing_preproc)
        return tokenIdentifier;

    // No quote or apostrophe character allowed to follow directly.
    if (*rCode == '\"' || *rCode == '\'')
        throw CCompileException(rCode.GetLocation(), "Invalid characters following identifier.");

    // The identifier text is only needed for the case-insensitive collision check; determine it once.
    std::string ssIdentifier;
    if (!m_bCaseSensitive)
        ssIdentifier = static_cast<std::string>(tokenIdentifier);

    // Check for a keyword or the collision with a keyword
    for (const std::string& rssKeyword : m_vecReservedKeywords)
    {
        // Check for the token to be exactly the keyword.
        if (tokenKeyword == rssKeyword)
            return tokenKeyword;

        // If not case-sensitive, check for the token to be like the keyword with case differences.
        if (!m_bCaseSensitive &&
            std::equal(ssIdentifier.begin(), ssIdentifier.end(), rssKeyword.begin(), rssKeyword.end(),
                [](char a, char b) { return ToLowerChar(a) == ToLowerChar(b); }))
            throw CCompileException(tokenIdentifier, "Identifier collides with reserved keyword (they differ only in case).");
    }

    // The identifier is not a keyword and doesn't collide with any keyword.
    return tokenIdentifier;
}

// Get a separator token: { } ( ) [ ] ; . : or the scope separator ::
// Throws a CCompileException when the position is not at a separator.
CToken CLexer::GetSeparator(CCodePos& rCode) const
{
    if (!rCode.IsValid())
        return CToken();

    // Check for a separator
    CToken sToken;
    switch (*rCode)
    {
    case '{':
    case '}':
    case '(':
    case ')':
    case '[':
    case ']':
    case ';':
    case '.':
        // Single character separator
        sToken = rCode.GetLocation(ETokenType::token_separator);
        ++rCode;
        rCode.UpdateLocation(sToken);
        break;
    case ':':
        sToken = rCode.GetLocation(ETokenType::token_separator);
        ++rCode;

        // Check for another colon - this would be the scope separator
        if (*rCode == ':')
            ++rCode;
        rCode.UpdateLocation(sToken);
        break;
    default:
        throw CCompileException(rCode.GetLocation(), "Invalid symbol; expected separator { } ( ) [ ] : :: ; .");
    }

    return sToken;
}

// Get an operator token: = + - * / % ^ ! ~ , << >> | || & && == != < <= > >= # ## ?
// Throws a CCompileException when the position is not at an operator or when more than two number signs follow each other.
CToken CLexer::GetOperator(CCodePos& rCode) const
{
    if (!rCode.IsValid())
        return CToken();

    // Check for an operator
    CToken sToken;
    switch (*rCode)
    {
    case '+':
    case '-':
    case '*':
    case '/':
    case '%':
    case '^':
    case '~':
    case ',':
    case '?':
        // Single character operator
        sToken = rCode.GetLocation(ETokenType::token_operator);
        ++rCode;
        rCode.UpdateLocation(sToken);
        break;
    case '#':
        sToken = rCode.GetLocation(ETokenType::token_operator);

        // Skip the number sign
        ++rCode;

        // Another number sign might indicate that the operator is a concatenating operator, otherwise it is a
        // stringification operator.
        if (*rCode == '#')
            ++rCode;

        // Another number sign is not allowed
        if (*rCode == '#')
            throw CCompileException(rCode.GetLocation(), "Invalid operator");
        rCode.UpdateLocation(sToken);
        break;
    case '|':
    case '&':
        sToken = rCode.GetLocation(ETokenType::token_operator);

        // Deal with bitwise and logical operators (latter having two characters).
        if (rCode[0] == rCode[1])
            rCode += 2;
        else
            ++rCode;
        rCode.UpdateLocation(sToken);
        break;
    case '=':
    case '!':
        // These are operators as well; the token is typed accordingly (it was created as identifier token before).
        sToken = rCode.GetLocation(ETokenType::token_operator);

        // Deal with unary and comparison operators (latter having often two characters).
        if (rCode[1] == '=')
            rCode += 2;
        else
            ++rCode;
        rCode.UpdateLocation(sToken);
        break;
    case '<':
    case '>':
        // These are operators as well; the token is typed accordingly (it was created as identifier token before).
        sToken = rCode.GetLocation(ETokenType::token_operator);

        // Deal with bitwise shift and comparison operators having one or two characters.
        if (rCode[1] == '=' || rCode[0] == rCode[1])
            rCode += 2;
        else
            ++rCode;
        rCode.UpdateLocation(sToken);
        break;
    default:
        throw CCompileException(rCode.GetLocation(),
            "Invalid symbol; expected operator = + - * / % ^ ! ~ , << >> | || & && == != < <= > >= # ## ?");
    }

    return sToken;
}

// Get a literal token (number, string, character, boolean or nullptr). The code position is only advanced when the literal
// was lexed successfully. Throws a CCompileException for malformed literals and - unless lexing preprocessor directives -
// when the literal is directly followed by a letter, digit or underscore.
CToken CLexer::GetLiteral(CCodePos& rCode) const
{
    if (!rCode.IsValid())
        return CToken();

    // Check for different literal types
    // Integer examples:
    //  1, 10u, 20U, 010, 0x10L, 0x100LL
    // Floating point examples:
    //  1e10, 1e-5L, 1., 1.e-2, 3.14, .1f, 0.1e-1L, 0x1ffp10, 0X0p-1, 0x1.p0, 0xf.p-1, 0x0.123p-1, 0xa.bp10l
    // String examples:
    //  "...", u8"...", u"...", U"...", L"...", u8R"abc(...)abc"
    // Character examples:
    //  'x', u't', U'b', L'r', 'abcd', L'ab'
    CCodePos codeLocal(rCode);
    ETokenLiteralType eTokenLiteralType = ETokenLiteralType::token_undefined;
    if (IsDigitChar(*codeLocal) || *codeLocal == '.')
    {
        // Skip prefix (0x for hex and 0b for binary).
        const char* szCollection = "0123456789"; // Decimal collection
        eTokenLiteralType = ETokenLiteralType::token_literal_dec_integer;
        if (codeLocal[0] == '0' && ToLowerChar(codeLocal[1]) == 'x')
        {
            codeLocal += 2;
            szCollection = "0123456789abcdefABCDEF"; // Hexadecimal collection
            eTokenLiteralType = ETokenLiteralType::token_literal_hex_integer;
        }
        else if (codeLocal[0] == '0' && ToLowerChar(codeLocal[1]) == 'b')
        {
            codeLocal += 2;
            szCollection = "01"; // Binary collection
            eTokenLiteralType = ETokenLiteralType::token_literal_bin_integer;
        }
        else if (codeLocal[0] == '0')
        {
            // Prevent identifying float as octal number
            if (codeLocal[1] != 'e' && codeLocal[1] != 'E' && codeLocal[1] != '.')
            {
                szCollection = "01234567"; // Octal collection
                eTokenLiteralType = ETokenLiteralType::token_literal_oct_integer;
            }
        }

        // Read at least one number (only if not starting with a dot). The end of the string is not a number either; a prefix
        // without any digit (e.g. "0x" at the end of the code) is invalid.
        if (!IsCharInSet(szCollection, *codeLocal) && *codeLocal != '.')
            throw CCompileException(rCode.GetLocation(), "Invalid number literal");

        // Read number (with separators)
        while (IsCharInSet(szCollection, *codeLocal) || *codeLocal == '\'')
            ++codeLocal;

        // Floating or fixed point?
        // Are detected by . e E d D (decimal) or . p P (hexadecimal)
        const char cSuffix = ToLowerChar(*codeLocal);
        const bool bDecFloat = eTokenLiteralType == ETokenLiteralType::token_literal_dec_integer &&
            (cSuffix == '.' || cSuffix == 'e' || cSuffix == 'd');
        const bool bHexFloat = eTokenLiteralType == ETokenLiteralType::token_literal_hex_integer &&
            (cSuffix == '.' || cSuffix == 'p');
        if (bDecFloat || bHexFloat)
        {
            if (*codeLocal == '.')
            {
                // Skip the separator
                ++codeLocal;

                // Read the decimal
                while (IsCharInSet(szCollection, *codeLocal) || *codeLocal == '\'')
                    ++codeLocal;
            }

            // Change the type and check for exponent
            bool bExponent = false;
            if (eTokenLiteralType == ETokenLiteralType::token_literal_dec_integer)
            {
                eTokenLiteralType = ETokenLiteralType::token_literal_dec_floating_point;
                if (ToLowerChar(*codeLocal) == 'e')
                {
                    bExponent = true;
                    ++codeLocal;
                }
                else if (ToLowerChar(*codeLocal) == 'd')
                {
                    eTokenLiteralType = ETokenLiteralType::token_literal_fixed_point;
                    ++codeLocal;
                }
            }
            if (eTokenLiteralType == ETokenLiteralType::token_literal_hex_integer)
            {
                eTokenLiteralType = ETokenLiteralType::token_literal_hex_floating_point;
                if (ToLowerChar(*codeLocal) == 'p')
                {
                    bExponent = true;
                    ++codeLocal;
                }
            }

            // Process exponent
            if (bExponent)
            {
                // Check for optional sign
                if (*codeLocal == '+' || *codeLocal == '-')
                    ++codeLocal;

                // A number is compulsory
                if (!IsDigitChar(*codeLocal))
                    throw CCompileException(rCode.GetLocation(), "Invalid float literal");

                // Read the decimal number
                while (IsDigitChar(*codeLocal))
                    ++codeLocal;
            }

            // Check for a float suffix (only for floating point)
            if (eTokenLiteralType != ETokenLiteralType::token_literal_fixed_point && IsCharInSet("fFlL", *codeLocal))
                ++codeLocal;
        }
        else
        {
            // Read suffix
            bool bUnsigned = false;
            if (ToLowerChar(*codeLocal) == 'u')
            {
                ++codeLocal;
                bUnsigned = true;
            }
            if (ToLowerChar(*codeLocal) == 'l')
            {
                // Skip the suffix (could be l, L, ll or LL)
                if (codeLocal[0] == codeLocal[1])
                    codeLocal += 2;
                else
                    ++codeLocal;

                // Unsigned could also come after the long suffix
                if (!bUnsigned && ToLowerChar(*codeLocal) == 'u')
                    ++codeLocal;
            }
        }
    }
    else if (const SKeywordLiteral* psKeyword = MatchKeywordLiteral(codeLocal)) // Boolean or nullptr
    {
        eTokenLiteralType = psKeyword->eType;
        codeLocal += psKeyword->uiLength;
    }
    else // Character or string
    {
        // Encoding format of characters.
        enum class ECharEncoding : uint32_t
        {
            encoding_ascii, //!< Encoding for ASCII ISO Latin-1 (8859-1) character set
            encoding_utf8,  //!< Encoding for UTF-8
            encoding_utf16, //!< Encoding for UTF-16
            encoding_utf32, //!< Encoding for UTF-32
            encoding_wide   //!< Encoding for wide character (platform specific)
        };

        // Check for Unicode pre-fix
        ECharEncoding eEncoding = ECharEncoding::encoding_ascii;
        switch (*codeLocal)
        {
        case 'u':
            ++codeLocal;
            if (*codeLocal == '8')
            {
                eEncoding = ECharEncoding::encoding_utf8;
                ++codeLocal;
            }
            else
                eEncoding = ECharEncoding::encoding_utf16;
            break;
        case 'U':
            ++codeLocal;
            eEncoding = ECharEncoding::encoding_utf32;
            break;
        case 'L':
            eEncoding = ECharEncoding::encoding_wide;
            ++codeLocal;
            break;
        default:
            break;
        }

        // Check for raw prefix
        bool bRaw = false;
        if (*codeLocal == 'R')
        {
            bRaw = true;
            ++codeLocal;
        }

        // Check for quote or apostrophe (the beginning of a string or character).
        bool bIsString = false;
        bool bIsCharacter = false;
        if (*codeLocal == '\"')
        {
            eTokenLiteralType =
                bRaw ? ETokenLiteralType::token_literal_raw_string : ETokenLiteralType::token_literal_string;
            bIsString = true;
        }
        else if (*codeLocal == '\'')
        {
            if (bRaw)
                throw CCompileException(rCode.GetLocation(),"No raw character literal support.");
            if (eEncoding == ECharEncoding::encoding_utf8)
                throw CCompileException(rCode.GetLocation(),
                    "No UTF-8 character literal support; use ASCII character literal instead.");
            eTokenLiteralType = ETokenLiteralType::token_literal_character;
            bIsCharacter = true;
        }

        // Not a string, a character, a boolean or a nullptr
        if (!bIsString && !bIsCharacter)
            throw CCompileException(rCode.GetLocation(), "Expecting a literal.");

        // Skip left quote or apostrophe
        ++codeLocal;

        // If raw check for delimiter pattern (max 16 chars)
        std::string ssDelimiter;
        if (bRaw)
        {
            // The closing pattern starts with a right parenthesis.
            ssDelimiter += ')';

            // Detect the (optional) raw pattern
            for (int i = 0; i < 16; ++i)
            {
                if (!*codeLocal)
                    throw CCompileException(rCode.GetLocation(), "Invalid string; unexpected end of file detected.");
                if (*codeLocal == '(')
                    break;
                ssDelimiter += *codeLocal;
                ++codeLocal;
            }

            // And add for the end quote
            ssDelimiter += '\"';

            // Raw strings must start with opening bracket
            if (*codeLocal != '(')
                throw CCompileException(rCode.GetLocation(), "Invalid string; expecting '('.");

            // Skip opening bracket
            ++codeLocal;
        }
        else
            ssDelimiter = bIsCharacter ? "\'" : "\"";

        // Count the amount of real characters (and skip the amount of read characters).
        uint32_t uiByteCnt = 0;
        uint32_t uiCharCnt = 0;
        switch (eEncoding)
        {
        case ECharEncoding::encoding_ascii:
            {
                std::string ssText;
                InterpretCText(codeLocal, ssDelimiter.c_str(), ssText, uiByteCnt, bRaw, true);
                uiCharCnt = static_cast<uint32_t>(ssText.size());
                break;
            }
        case ECharEncoding::encoding_utf8:
            {
                std::string ssText;
                InterpretCText(codeLocal, ssDelimiter.c_str(), ssText, uiByteCnt, bRaw);
                uiCharCnt = static_cast<uint32_t>(ssText.size());
                break;
            }
        case ECharEncoding::encoding_utf16:
            {
                std::u16string ssText;
                InterpretCText(codeLocal, ssDelimiter.c_str(), ssText, uiByteCnt, bRaw);
                uiCharCnt = static_cast<uint32_t>(ssText.size());
                break;
            }
        case ECharEncoding::encoding_utf32:
            {
                std::u32string ssText;
                InterpretCText(codeLocal, ssDelimiter.c_str(), ssText, uiByteCnt, bRaw);
                uiCharCnt = static_cast<uint32_t>(ssText.size());
                break;
            }
        case ECharEncoding::encoding_wide:
            {
                std::wstring ssText;
                InterpretCText(codeLocal, ssDelimiter.c_str(), ssText, uiByteCnt, bRaw);
                uiCharCnt = static_cast<uint32_t>(ssText.size());
                break;
            }
        }
        codeLocal += uiByteCnt;

        // Skip the delimiter
        codeLocal += static_cast<uint32_t>(ssDelimiter.size());

        // Determine whether the character literal is actually a sequence
        if (bIsCharacter && uiCharCnt > 1)
        {
            switch (eEncoding)
            {
            case ECharEncoding::encoding_ascii:
                if (uiCharCnt == 2 || uiCharCnt == 4 || uiCharCnt == 8)
                    eTokenLiteralType = ETokenLiteralType::token_literal_character_sequence;
                else
                    throw CCompileException(codeLocal.GetLocation(),
                        "Invalid character sequence; only 2, 4 or 8 characters are allowed.");
                break;
            case ECharEncoding::encoding_wide:
                if constexpr (sizeof(wchar_t) == 2)
                {
                    if (uiCharCnt == 2 || uiCharCnt == 4)
                        eTokenLiteralType = ETokenLiteralType::token_literal_character_sequence;
                    else
                        throw CCompileException(codeLocal.GetLocation(),
                            "Invalid character sequence; only 2 or 4 characters are allowed.");
                }
                else // sizeof is 4
                {
                    if (uiCharCnt == 2)
                        eTokenLiteralType = ETokenLiteralType::token_literal_character_sequence;
                    else
                        throw CCompileException(codeLocal.GetLocation(),
                            "Invalid character sequence; only 2 characters are allowed.");
                }
                break;
            default:
                throw CCompileException(codeLocal.GetLocation(),
                    "Character sequences are only allowed with ASCII or wide characters.");
            }
        }
    }

    // Initialize the token
    CToken token = rCode.GetLocation(ETokenType::token_literal);

    // Update the position
    rCode = codeLocal;

    // No further digit and alpha-digits are allowed to follow the literal
    if (m_eLexingMode != ELexingMode::lexing_preproc && (IsAlnumChar(*rCode) || *rCode == '_'))
        throw CCompileException(rCode.GetLocation(), "Invalid characters following literal.");

    // Update the token
    rCode.UpdateLocation(token, eTokenLiteralType);

    // Check whether the token is valid
    if (!token)
        throw CCompileException(token, "Internal error: the token is not valid.");

    return token;
}

// Tokenize the zero-terminated code using a case-sensitive lexer with a dummy callback. Lexing stops at the first empty token
// or at the first compile error; the compile error is deliberately swallowed and the tokens collected so far are returned.
CTokenList Tokenize(const char* szCode, const CContextPtr& rptrContext)
{
    CCodePos code(szCode);
    SLexerDummyCallback sCallback;
    CLexer lexer(&sCallback, true);
    CToken token;
    CTokenList lstTokens;
    try
    {
        while ((token = lexer.GetToken(code, rptrContext)))
            lstTokens.push_back(std::move(token));
    }
    catch (const sdv::idl::XCompileError&)
    {
        // Best effort: return what was lexed up to the error.
    }
    return lstTokens;
}

// Tokenize the code supplied as string; see the overload taking a zero-terminated string.
CTokenList Tokenize(const std::string& rssCode, const CContextPtr& rptrContext)
{
    return Tokenize(rssCode.c_str(), rptrContext);
}