Files

1103 lines
38 KiB
C++
Raw Permalink Normal View History

#include "lexer.h"
#include "exception.h"
#include "support.h"
/**
 * @brief Construct a lexer.
 * @param[in] pCallback Callback object that is stored in the lexer; must not be null.
 * @param[in] bCaseSensitive Stored as the case-sensitivity flag consulted when identifiers are checked against keywords.
 * @param[in] eLexingMode Lexing mode stored in the lexer.
 * @throws CCompileException when pCallback is null.
 */
CLexer::CLexer(ILexerCallback* pCallback, bool bCaseSensitive, ELexingMode eLexingMode /*= ELexingMode::lexing_idl*/)
    : m_pLexerCallback(pCallback)
    , m_bCaseSensitive(bCaseSensitive)
    , m_eLexingMode(eLexingMode)
    , m_vecReservedKeywords(g_vecOmgIdlKeywords) // start from the global OMG IDL keyword list
{
    // The lexer cannot operate without a callback object.
    if (pCallback == nullptr)
        throw CCompileException("Invalid parameter; no callbback provided.");
}
/**
 * @brief Append a keyword to the list of reserved keywords.
 * @param[in] rssKeyword The keyword text to add.
 */
void CLexer::AddKeyword(const std::string& rssKeyword)
{
    m_vecReservedKeywords.emplace_back(rssKeyword);
}
/**
 * @brief Get the next token at the current code position.
 * @details The first byte at the position is classified through a 256 entry table and the matching specialised
 * function is called. Whitespace and comments are passed to the lexer callback and lexing continues behind them.
 * A '#' outside of preprocessor lexing mode is handed to the callback as a preprocessor directive when a newline
 * occurred since the last token; an empty token is returned in that case.
 * @param[in,out] rCode The code position; advanced past the consumed characters.
 * @param[in] rptrContext Context that is assigned to the tokens.
 * @return The token, or an empty token (end of code, empty whitespace or after a preprocessor directive).
 * @throws CCompileException on a character without classification or on a '#' that is not at the start of a line.
 */
CToken CLexer::GetToken(CCodePos& rCode, const CContextPtr& rptrContext) const
{
    if (!rCode.IsValid()) return CToken();

    // Based on the character value decide what to do. For this a table is used mapping a character (byte) to a
    // classification. This table uses the UTF-8 coding scheme. Character values under 0x20 are control and space
    // characters. Character values between 0x80 and 0xBF are part of a multi-byte character and therefore have no
    // parsing function when found stand-alone. Character values between 0xC0 and 0xF7 are the initial part of a
    // multi-byte character and therefore are classified as identifier. Character values between 0xF8 and 0xFF are not
    // defined.
    // The following abbreviations are used:
    //   none  - not classified
    //   eof   - end of file (end of string)
    //   space - whitespace
    //   ident - identifier
    //   lit   - literal (number, string)
    //   idlit - identifier or literal
    //   split - separator or literal
    //   sep   - separator
    //   oper  - operator
    //   opcom - operator or comments
    //   pproc - pre-processor directive or symbol
    // Remarks: the question mark (0x3F) is classified as operator, since GetOperator (and not GetSeparator) accepts it.
    static const enum {none, eof, space, ident, lit, idlit, split, sep, oper, opcom, pproc} rgeClassify[256] =
    {
    //  0x0    0x1    0x2    0x3    0x4    0x5    0x6    0x7    0x8    0x9    0xA    0xB    0xC    0xD    0xE    0xF
        eof,   none,  none,  none,  none,  none,  none,  none,  none,  space, space, space, space, space, none,  none,  // 0x00
        none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  // 0x10
        space, oper,  lit,   pproc, none,  oper,  oper,  lit,   sep,   sep,   oper,  oper,  oper,  oper,  split, opcom, // 0x20
        lit,   lit,   lit,   lit,   lit,   lit,   lit,   lit,   lit,   lit,   sep,   sep,   oper,  oper,  oper,  oper,  // 0x30
        none,  ident, ident, ident, ident, ident, idlit, ident, ident, ident, ident, idlit, idlit, ident, idlit, ident, // 0x40
        ident, ident, idlit, ident, idlit, idlit, ident, ident, ident, ident, ident, sep,   space, sep,   oper,  ident, // 0x50
        none,  ident, ident, ident, ident, ident, idlit, ident, ident, ident, ident, ident, ident, ident, idlit, ident, // 0x60
        ident, ident, ident, ident, idlit, idlit, ident, ident, ident, ident, ident, sep,   oper,  sep,   oper,  none,  // 0x70
        none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  // 0x80
        none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  // 0x90
        none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  // 0xA0
        none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  none,  // 0xB0
        ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, // 0xC0
        ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, // 0xD0
        ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, ident, // 0xE0
        ident, ident, ident, ident, ident, ident, ident, ident, none,  none,  none,  none,  none,  none,  none,  none   // 0xF0
    };

    // Determine the action to do; repeat if needed
    CToken token;
    bool bRetry = false;
    do
    {
        bRetry = false;

        // Index with the byte value (0..255). Converting a negative char directly to a 32-bit unsigned value would
        // sign-extend it and read far outside of the table.
        switch (rgeClassify[static_cast<unsigned char>(*rCode)])
        {
        case eof:
            return CToken(); // End of file/end of string.
        case space:
            token = GetWhitespace(rCode, m_bNewlineOccurred);
            token.SetContext(rptrContext);
            if (token.GetLength())
            {
                // Callback on the whitespace
                if (m_pLexerCallback)
                    m_pLexerCallback->InsertWhitespace(token);

                // Continue processing
                bRetry = true;
            }
            else
                token = CToken(); // Nothing was consumed; return an empty token.
            break;
        case ident:
            token = GetIdentifierOrKeyword(rCode);
            m_bNewlineOccurred = false;
            break;
        case lit:
            token = GetLiteral(rCode);
            m_bNewlineOccurred = false;
            break;
        case idlit:
            // Capturing wide, UTF-8, UTF-16, UTF-32 strings and raw strings: a quote or apostrophe following a prefix
            // of one to three characters. The <cctype> functions need a value representable as unsigned char.
            if ((rCode[1] == '\"' || rCode[1] == '\'') ||
                (std::isalnum(static_cast<unsigned char>(rCode[1])) && (rCode[2] == '\"' || rCode[2] == '\'')) ||
                (std::isalnum(static_cast<unsigned char>(rCode[1])) && std::isalpha(static_cast<unsigned char>(rCode[2])) &&
                    (rCode[3] == '\"' || rCode[3] == '\'')))
                token = GetLiteral(rCode);
            else
            {
                // Check for the literals true, false and nullptr (and their upper-case forms). The text is taken from
                // the string conversion of a token located at the current position; it is converted only once.
                CToken tokenTemp = rCode.GetLocation();
                const std::string ssAhead = static_cast<std::string>(tokenTemp);
                const auto fnIsLiteralWord = [&rCode, &ssAhead](const char* szWord)
                {
                    const uint32_t uiLen = static_cast<uint32_t>(std::char_traits<char>::length(szWord));
                    if (ssAhead.compare(0, uiLen, szWord) != 0)
                        return false;

                    // The word must end here. An underscore continues an identifier (e.g. "true_value"), exactly
                    // like a letter or a digit does.
                    const auto cNext = rCode[uiLen];
                    return !std::isalnum(static_cast<unsigned char>(cNext)) && cNext != '_';
                };
                if (fnIsLiteralWord("true") || fnIsLiteralWord("TRUE") || fnIsLiteralWord("false") ||
                    fnIsLiteralWord("FALSE") || fnIsLiteralWord("nullptr") || fnIsLiteralWord("NULL"))
                    token = GetLiteral(rCode);
                else
                    token = GetIdentifierOrKeyword(rCode);
            }
            m_bNewlineOccurred = false;
            break;
        case split:
            // Capturing floating points starting with a dot
            if (std::isdigit(static_cast<unsigned char>(rCode[1])))
                token = GetLiteral(rCode);
            else
                token = GetSeparator(rCode);
            m_bNewlineOccurred = false; // A token was read; same handling as for all other token cases.
            break;
        case sep:
            token = GetSeparator(rCode);
            m_bNewlineOccurred = false;
            break;
        case oper:
            token = GetOperator(rCode);
            m_bNewlineOccurred = false;
            break;
        case opcom:
            if (rCode[1] == '/' || rCode[1] == '*')
            {
                token = GetComments(rCode);
                token.SetContext(rptrContext);

                // Callback on the comment.
                if (m_pLexerCallback)
                    m_pLexerCallback->InsertComment(token);

                // Continue processing
                bRetry = true;
            }
            else
            {
                token = GetOperator(rCode);
                m_bNewlineOccurred = false;
            }
            break;
        case pproc:
            if (m_eLexingMode != ELexingMode::lexing_preproc)
            {
                // The '#' token is only a valid operator when lexing preprocessor directives
                if (m_bNewlineOccurred)
                {
                    // Callback on the preprocessor.
                    if (m_pLexerCallback)
                        m_pLexerCallback->ProcessPreprocDirective(rCode);

                    // REMARKS: The preprocessor could include another file. This will change the context of the
                    // parser. Return an empty token to indicate the parser to try to get the next token.
                    token = CToken();
                }
                else
                    throw CCompileException(rCode.GetLocation(), "Unexpected token '#'.");
            }
            else
            {
                token = GetOperator(rCode);
                m_bNewlineOccurred = false;
            }
            break;
        case none:
        default:
            throw CCompileException(rCode.GetLocation(), "Unexpected token '", *rCode, "'.");
        }
    } while (bRetry); // Retry when some intermediate processing took place.

    if (token)
    {
        // NOTE(review): the last valid token is stored before the context is assigned - confirm this is intended.
        m_tokenLastValid = token;
        token.SetContext(rptrContext);
    }
    return token;
}
/**
 * @brief Access the token stored as the last valid token.
 * @return Reference to the stored token.
 */
const CToken& CLexer::GetLastValidToken() const
{
    return this->m_tokenLastValid;
}
/**
 * @brief Read a token that runs from the current position up to (not including) a terminating symbol.
 * @details Reading also stops at the end of the code and, in preprocessor lexing mode, in front of a line end.
 * A backslash directly followed by a line end is skipped together with that line end.
 * @param[in,out] rCode The code position; advanced to the position where reading stopped.
 * @param[in] cSymbol The symbol terminating the token.
 * @return The token covering the characters that were read.
 */
CToken CLexer::GetCustom(CCodePos& rCode, char cSymbol) const
{
    if (!rCode.IsValid())
        return CToken();

    // The token starts here
    CToken tokenCustom = rCode.GetLocation();
    const bool bPreproc = (m_eLexingMode == ELexingMode::lexing_preproc);

    for (;;)
    {
        const auto cCurrent = *rCode;

        // End of the code
        if (cCurrent == '\0')
            break;

        // Backslash: skip it together with a directly following line end
        if (cCurrent == '\\')
        {
            if (rCode[1] == '\n')
                rCode += 2;
            else if (rCode[1] == '\r' && rCode[2] == '\n')
                rCode += 3;
            else
                ++rCode;
            continue;
        }

        // Carriage return: ends the token when part of a line end in preprocessor mode
        if (cCurrent == '\r')
        {
            if (bPreproc && rCode[1] == '\n')
                break;
            ++rCode;
            continue;
        }

        // Line feed: ends the token in preprocessor mode, otherwise it is consumed and remembered
        if (cCurrent == '\n')
        {
            if (bPreproc)
                break;
            ++rCode;
            m_bNewlineOccurred = true;
            continue;
        }

        // Any other character: stop in front of the requested symbol
        if (cCurrent == cSymbol)
            break;
        ++rCode;
    }

    // Finalize the token and remember it when valid
    rCode.UpdateLocation(tokenCustom);
    if (tokenCustom)
        m_tokenLastValid = tokenCustom;
    return tokenCustom;
}
/**
 * @brief Skip the remainder of the current line.
 * @details Outside of preprocessor lexing mode the line end ("\n" or "\r\n") is consumed and the newline flag is
 * set; in preprocessor lexing mode the position stays in front of the line end. A backslash directly followed by a
 * line end is skipped together with that line end, so skipping continues on the next line. Every other skipped
 * character clears the newline flag.
 * @param[in,out] rCode The code position; advanced while skipping.
 */
void CLexer::SkipLine(CCodePos& rCode) const
{
    if (!rCode.IsValid())
        return;

    const bool bKeepLineEnd = (m_eLexingMode == ELexingMode::lexing_preproc);
    while (true)
    {
        const auto cCurrent = *rCode;

        // End of the code
        if (cCurrent == '\0')
            return;

        // Line end reached?
        const bool bCrLf = (cCurrent == '\r' && rCode[1] == '\n');
        if (bCrLf || cCurrent == '\n')
        {
            if (!bKeepLineEnd)
            {
                if (bCrLf)
                    rCode += 2;
                else
                    ++rCode;
                m_bNewlineOccurred = true;
            }
            return;
        }

        // Backslash directly followed by a line end: skip both without touching the newline flag
        if (cCurrent == '\\')
        {
            if (rCode[1] == '\n')
            {
                rCode += 2;
                continue;
            }
            if (rCode[1] == '\r' && rCode[2] == '\n')
            {
                rCode += 3;
                continue;
            }
        }

        // Any other character (including a lone carriage return or backslash)
        m_bNewlineOccurred = false;
        ++rCode;
    }
}
void CLexer::EnablePreprocProcessing()
{
m_bNewlineOccurred = true;
}
/**
 * @brief Read a run of whitespace at the current position.
 * @details Space, tab, form feed, vertical tab and a carriage return not followed by a line feed are consumed.
 * Line ends ("\n" or "\r\n") are consumed outside of preprocessor lexing mode; in preprocessor lexing mode reading
 * stops in front of them. A backslash directly followed by a line end is consumed together with that line end.
 * @param[in,out] rCode The code position; advanced past the whitespace.
 * @param[out] rbNewline Set to true when a line end was consumed; not changed otherwise.
 * @return The whitespace token (possibly without any length).
 */
CToken CLexer::GetWhitespace(CCodePos& rCode, bool& rbNewline) const
{
    if (!rCode.IsValid())
        return CToken();

    CToken tokenSpace = rCode.GetLocation(ETokenType::token_whitespace);
    const bool bStopAtLineEnd = (m_eLexingMode == ELexingMode::lexing_preproc);

    for (;;)
    {
        const auto cCurrent = *rCode;

        // Plain whitespace characters
        if (cCurrent == ' ' || cCurrent == '\t' || cCurrent == '\f' || cCurrent == '\v')
        {
            ++rCode;
            continue;
        }

        // Carriage return; either stand-alone or as the first part of a line end
        if (cCurrent == '\r')
        {
            if (rCode[1] != '\n')
            {
                ++rCode;
                continue;
            }
            if (bStopAtLineEnd)
                break;
            rCode += 2;
            rbNewline = true;
            continue;
        }

        // Line feed
        if (cCurrent == '\n')
        {
            if (bStopAtLineEnd)
                break;
            ++rCode;
            rbNewline = true;
            continue;
        }

        // Backslash directly followed by a line end
        if (cCurrent == '\\')
        {
            if (rCode[1] == '\n')
            {
                rCode += 2;
                continue;
            }
            if (rCode[1] == '\r' && rCode[2] == '\n')
            {
                rCode += 3;
                continue;
            }
        }

        // End of the code or a character that is not whitespace
        break;
    }

    // Finalize the token
    rCode.UpdateLocation(tokenSpace);
    return tokenSpace;
}
/**
 * @brief Read a C-style or C++-style comment at the current position.
 * @details A C-style comment runs until its closing sequence. A C++-style comment runs until the end of the line;
 * outside of preprocessor lexing mode the line end is consumed as part of the comment and the newline flag is set,
 * so that a '#' at the start of the following line is still recognized as the start of a preprocessor directive.
 * In preprocessor lexing mode a line end is never consumed.
 * @param[in,out] rCode The code position; advanced past the comment.
 * @return The comment token.
 * @throws CCompileException when a C-style comment is not closed or when the position does not hold a comment.
 */
CToken CLexer::GetComments(CCodePos& rCode) const
{
    if (!rCode.IsValid()) return CToken();

    // Initialize the token
    CToken sToken = rCode.GetLocation(ETokenType::token_comments);

    // C-style comment?
    if (rCode[0] == '/' && rCode[1] == '*')
    {
        // Skip the comment opening
        rCode += 2;

        // Skip characters until comment closing
        bool bDone = false;
        bool bEnd = false;
        while (!bDone)
        {
            switch (*rCode)
            {
            case '*':
                if (rCode[1] == '/')
                {
                    // Skip the comment closing
                    rCode += 2;
                    bEnd = true;
                    bDone = true;
                }
                else
                    ++rCode;
                break;
            case '\r':
                if (rCode[1] == '\n')
                {
                    if (m_eLexingMode != ELexingMode::lexing_preproc)
                        rCode += 2;
                    else
                        bDone = true; // A line end stops the comment when lexing preprocessor directives.
                }
                else
                    ++rCode;
                break;
            case '\n':
                if (m_eLexingMode != ELexingMode::lexing_preproc)
                    ++rCode;
                else
                    bDone = true; // A line end stops the comment when lexing preprocessor directives.
                break;
            case '\0':
                bDone = true;
                break;
            default:
                ++rCode;
                break;
            }
        }

        // Comment was closed properly?
        if (!bEnd)
            throw CCompileException(rCode.GetLocation(ETokenType::token_comments), "End of file while looking for a close comment statement.");

        // Update the token
        rCode.UpdateLocation(sToken);
    }
    else if (rCode[0] == '/' && rCode[1] == '/') // C++-style comment?
    {
        // Skip the comment opening
        rCode += 2;

        bool bDone = false;
        while (!bDone)
        {
            switch (*rCode)
            {
            case '\r':
                if (rCode[1] == '\n')
                {
                    if (m_eLexingMode != ELexingMode::lexing_preproc)
                    {
                        // The line end is consumed here; remember that a new line starts behind the comment.
                        rCode += 2;
                        m_bNewlineOccurred = true;
                    }
                    bDone = true;
                }
                else
                    ++rCode;
                break;
            case '\n':
                if (m_eLexingMode != ELexingMode::lexing_preproc)
                {
                    // The line end is consumed here; remember that a new line starts behind the comment.
                    ++rCode;
                    m_bNewlineOccurred = true;
                }
                bDone = true;
                break;
            case '\0':
                bDone = true;
                break;
            default:
                ++rCode;
                break;
            }
        }

        // Update the token once, after the end of the comment has been found
        rCode.UpdateLocation(sToken);
    }
    else
        throw CCompileException(rCode.GetLocation(ETokenType::token_comments), "Expecting C-sytel or C++ comments.");

    return sToken;
}
/**
 * @brief Read an identifier or a keyword at the current position.
 * @details Letters, digits and underscores are consumed. Outside of preprocessor lexing mode the text is compared
 * with the reserved keywords: an exact match yields a keyword token; when the lexer is not case-sensitive, a match
 * that differs only in case is an error.
 * @param[in,out] rCode The code position; advanced past the identifier.
 * @return A keyword token or an identifier token.
 * @throws CCompileException when the position does not start with a letter or underscore, when a quote or
 * apostrophe directly follows the identifier, or when the identifier collides with a reserved keyword.
 */
CToken CLexer::GetIdentifierOrKeyword(CCodePos& rCode) const
{
    if (!rCode.IsValid()) return CToken();

    // Check for an alphabetical character. The <cctype> functions require a value representable as unsigned char;
    // passing a negative char is undefined behaviour.
    if (!std::isalpha(static_cast<unsigned char>(*rCode)) && *rCode != '_')
        throw CCompileException(rCode.GetLocation(), "Invalid symbol; expected identifier.");

    // Initialize two tokens: identifier and keyword (this is not an expensive action)
    CToken tokenIdentifier = rCode.GetLocation(ETokenType::token_identifier);
    CToken tokenKeyword = rCode.GetLocation(ETokenType::token_keyword);

    // Continue until the current character is not alpha-numerical any more
    while (std::isalnum(static_cast<unsigned char>(*rCode)) || *rCode == '_')
        ++rCode;

    // Update the tokens
    rCode.UpdateLocation(tokenIdentifier);
    rCode.UpdateLocation(tokenKeyword);

    // Special treatment if not lexing preprocessor directives
    if (m_eLexingMode != ELexingMode::lexing_preproc)
    {
        // No quote or apostrophe character allowed to follow directly.
        if (*rCode == '\"' || *rCode == '\'')
            throw CCompileException(rCode.GetLocation(), "Invalid characters following identifier.");

        // The identifier text is only needed for the case-insensitive comparison; build it once instead of once per
        // keyword.
        const std::string ssIdentifier = m_bCaseSensitive ? std::string() : static_cast<std::string>(tokenIdentifier);
        const auto fnEqualIgnoringCase = [](char a, char b)
        {
            return std::tolower(static_cast<unsigned char>(a)) == std::tolower(static_cast<unsigned char>(b));
        };

        // Check for a keyword or the collision with a keyword
        for (const std::string& rssKeyword : m_vecReservedKeywords)
        {
            // Check for the token to be exactly the keyword.
            if (tokenKeyword == rssKeyword)
                return tokenKeyword;

            // If not case-sensitive, check for the token to be like the keyword with case differences.
            if (!m_bCaseSensitive &&
                std::equal(ssIdentifier.begin(), ssIdentifier.end(), rssKeyword.begin(), rssKeyword.end(),
                    fnEqualIgnoringCase))
                throw CCompileException(tokenIdentifier, "Identifier collides with reserved keyword (they differ only in case).");
        }
    }

    // The identifier is not a keyword and doesn't collide with any keyword.
    return tokenIdentifier;
}
/**
 * @brief Read a separator at the current position.
 * @details Accepted separators are { } ( ) [ ] ; . and the colon; two colons in a row form one separator token.
 * @param[in,out] rCode The code position; advanced past the separator.
 * @return The separator token.
 * @throws CCompileException when the current character is not a separator.
 */
CToken CLexer::GetSeparator(CCodePos& rCode) const
{
    if (!rCode.IsValid())
        return CToken();

    // Classify the current character
    const auto cFirst = *rCode;
    const bool bColon = (cFirst == ':');
    const bool bSingleChar = (cFirst == '{' || cFirst == '}' || cFirst == '(' || cFirst == ')' || cFirst == '[' ||
        cFirst == ']' || cFirst == ';' || cFirst == '.');
    if (!bColon && !bSingleChar)
        throw CCompileException(rCode.GetLocation(), "Invalid symbol; expected separator { } ( ) [ ] : :: ; .");

    // Start the token and consume the separator character
    CToken tokenSeparator = rCode.GetLocation(ETokenType::token_separator);
    ++rCode;

    // A second colon makes it the scope separator
    if (bColon && *rCode == ':')
        ++rCode;

    // Finalize the token
    rCode.UpdateLocation(tokenSeparator);
    return tokenSeparator;
}
/**
 * @brief Read an operator at the current position.
 * @details Accepted operators are = + - * / % ^ ! ~ , << >> | || & && == != < <= > >= # ## ?. Every returned token
 * is of type ETokenType::token_operator; this includes the assignment, negation, comparison and shift operators.
 * @param[in,out] rCode The code position; advanced past the operator.
 * @return The operator token.
 * @throws CCompileException when the current character is not an operator or when three '#' follow each other.
 */
CToken CLexer::GetOperator(CCodePos& rCode) const
{
    if (!rCode.IsValid()) return CToken();

    // Check for an operator
    CToken sToken;
    switch (*rCode)
    {
    case '+':
    case '-':
    case '*':
    case '/':
    case '%':
    case '^':
    case '~':
    case ',':
    case '?':
        // Single character operators
        sToken = rCode.GetLocation(ETokenType::token_operator);
        ++rCode;
        rCode.UpdateLocation(sToken);
        break;
    case '#':
        sToken = rCode.GetLocation(ETokenType::token_operator);

        // Skip the number sign
        ++rCode;

        // Another number sign might indicate that the operator is a concatenating operator, otherwise it is a
        // stringification operator.
        if (*rCode == '#')
            ++rCode;

        // Another number sign is not allowed
        if (*rCode == '#')
            throw CCompileException(rCode.GetLocation(), "Invalid operator");

        rCode.UpdateLocation(sToken);
        break;
    case '|':
    case '&':
        sToken = rCode.GetLocation(ETokenType::token_operator);

        // Deal with bitwise and logical operators (latter having two characters).
        if (rCode[0] == rCode[1])
            rCode += 2;
        else
            ++rCode;

        rCode.UpdateLocation(sToken);
        break;
    case '=':
    case '!':
        // These are operators as well; the token must not be typed as an identifier.
        sToken = rCode.GetLocation(ETokenType::token_operator);

        // Deal with unary and comparison operators (latter having often two characters).
        if (rCode[1] == '=')
            rCode += 2;
        else
            ++rCode;

        rCode.UpdateLocation(sToken);
        break;
    case '<':
    case '>':
        // These are operators as well; the token must not be typed as an identifier.
        sToken = rCode.GetLocation(ETokenType::token_operator);

        // Deal with bitwise shift and comparison operators having one or two characters.
        if (rCode[1] == '=' || rCode[0] == rCode[1])
            rCode += 2;
        else
            ++rCode;

        rCode.UpdateLocation(sToken);
        break;
    default:
        throw CCompileException(rCode.GetLocation(),
            "Invalid symbol; expected operator = + - * / % ^ ! ~ , << >> | || & && == != < <= > >= # ## ?");
    }
    return sToken;
}
/**
 * @brief Read a literal at the current position.
 * @details Supported are integer literals (decimal, octal, hexadecimal with 0x prefix, binary with 0b prefix,
 * optional u/l/ll suffixes, apostrophes as digit separators), floating point and fixed point literals, string and
 * character literals with optional encoding prefix (u8, u, U, L) and raw prefix (R), the boolean literals
 * true/TRUE/false/FALSE and the null literals nullptr/NULL.
 * Examples:
 *   Integer:        1, 10u, 20U, 010, 0x10L, 0x100LL
 *   Floating point: 1e10, 1e-5L, 1., 1.e-2, 3.14, .1f, 0.1e-1L, 0x1ffp10, 0X0p-1, 0x1.p0, 0xf.p-1, 0x0.123p-1, 0xa.bp10l
 *   String:         "...", u8"...", u"...", U"...", L"...", u8R"abc(...)abc"
 *   Character:      'x', u't', U'r', L'b', 'abcd', L'ab'
 * @param[in,out] rCode The code position; advanced past the literal. Not changed when an exception is thrown
 * before the literal was read completely.
 * @return The literal token carrying the detected literal type.
 * @throws CCompileException when the literal is malformed or, outside of preprocessor lexing mode, directly followed
 * by a letter, digit or underscore.
 */
CToken CLexer::GetLiteral(CCodePos& rCode) const
{
    if (!rCode.IsValid()) return CToken();

    // The <cctype> functions require a value representable as unsigned char; passing a negative char is undefined
    // behaviour. These wrappers take care of the conversion.
    const auto fnIsDigit = [](char c) { return std::isdigit(static_cast<unsigned char>(c)) != 0; };
    const auto fnIsAlnum = [](char c) { return std::isalnum(static_cast<unsigned char>(c)) != 0; };
    const auto fnToLower = [](char c) { return static_cast<char>(std::tolower(static_cast<unsigned char>(c))); };

    // Work on a copy of the position; the caller's position is updated at the end only.
    CCodePos codeLocal(rCode);
    ETokenLiteralType eTokenLiteralType = ETokenLiteralType::token_undefined;
    if (fnIsDigit(*codeLocal) || *codeLocal == '.')
    {
        // Skip prefix (0x for hex and 0b for binary).
        const char* szCollection = "0123456789"; // Decimal collection
        eTokenLiteralType = ETokenLiteralType::token_literal_dec_integer;
        if (codeLocal[0] == '0' && fnToLower(codeLocal[1]) == 'x')
        {
            codeLocal += 2;
            szCollection = "0123456789abcdefABCDEF"; // Hexadecimal collection
            eTokenLiteralType = ETokenLiteralType::token_literal_hex_integer;
        }
        else if (codeLocal[0] == '0' && fnToLower(codeLocal[1]) == 'b')
        {
            codeLocal += 2;
            szCollection = "01"; // Binary collection
            eTokenLiteralType = ETokenLiteralType::token_literal_bin_integer;
        }
        else if (codeLocal[0] == '0')
        {
            // Prevent identifying float as octal number
            if (codeLocal[1] != 'e' && codeLocal[1] != 'E' && codeLocal[1] != '.')
            {
                szCollection = "01234567"; // Octal collection
                eTokenLiteralType = ETokenLiteralType::token_literal_oct_integer;
            }
        }

        // Membership test for the active digit collection. std::strchr also matches the terminating zero of the
        // collection, hence the explicit exclusion of the end-of-code character.
        const auto fnIsCollectionDigit = [szCollection](char c)
        {
            return c != '\0' && std::strchr(szCollection, c) != nullptr;
        };

        // Read at least one number (only if not starting with a dot). This also rejects a prefix ("0x", "0b") that
        // is directly followed by the end of the code.
        if (!fnIsCollectionDigit(*codeLocal) && *codeLocal != '.')
            throw CCompileException(rCode.GetLocation(), "Invalid number literal");

        // Read number (with separators)
        while (fnIsCollectionDigit(*codeLocal) || *codeLocal == '\'')
            ++codeLocal;

        // Floating or fixed point?
        // Are detected by . p P e E d D or suffix f or l
        const char cSuffix = fnToLower(*codeLocal);
        if ((eTokenLiteralType == ETokenLiteralType::token_literal_dec_integer &&
                (cSuffix == '.' || cSuffix == 'e' || cSuffix == 'd')) ||
            (eTokenLiteralType == ETokenLiteralType::token_literal_hex_integer && (cSuffix == '.' || cSuffix == 'p')))
        {
            if (*codeLocal == '.')
            {
                // Skip the separator
                ++codeLocal;

                // Read the decimal
                while (fnIsCollectionDigit(*codeLocal) || *codeLocal == '\'')
                    ++codeLocal;
            }

            // Change the type and check for exponent
            bool bExponent = false;
            if (eTokenLiteralType == ETokenLiteralType::token_literal_dec_integer)
            {
                eTokenLiteralType = ETokenLiteralType::token_literal_dec_floating_point;
                if (fnToLower(*codeLocal) == 'e')
                {
                    bExponent = true;
                    ++codeLocal;
                }
                else if (fnToLower(*codeLocal) == 'd')
                {
                    eTokenLiteralType = ETokenLiteralType::token_literal_fixed_point;
                    ++codeLocal;
                }
            }
            if (eTokenLiteralType == ETokenLiteralType::token_literal_hex_integer)
            {
                eTokenLiteralType = ETokenLiteralType::token_literal_hex_floating_point;
                if (fnToLower(*codeLocal) == 'p')
                {
                    bExponent = true;
                    ++codeLocal;
                }
            }

            // Process exponent
            if (bExponent)
            {
                // Check for optional sign
                if (*codeLocal == '+' || *codeLocal == '-')
                    ++codeLocal;

                // A number is compulsory
                if (!fnIsDigit(*codeLocal))
                    throw CCompileException(rCode.GetLocation(), "Invalid float literal");

                // Read the decimal number
                while (fnIsDigit(*codeLocal))
                    ++codeLocal;
            }

            // Check for a float suffix (only for floating point)
            const char* szSuffix = "fFlL";
            if (eTokenLiteralType != ETokenLiteralType::token_literal_fixed_point &&
                std::strchr(szSuffix, *codeLocal) && *codeLocal != '\0')
            {
                ++codeLocal;
            }
        }
        else
        {
            // Read suffix
            bool bUnsigned = false;
            if (fnToLower(*codeLocal) == 'u')
            {
                ++codeLocal;
                bUnsigned = true;
            }
            if (fnToLower(*codeLocal) == 'l')
            {
                // Skip the suffix (could be l, L, ll or LL)
                if (codeLocal[0] == codeLocal[1])
                    codeLocal += 2;
                else
                    ++codeLocal;

                // Unsigned could also come after the long suffix
                if (!bUnsigned && fnToLower(*codeLocal) == 'u')
                    ++codeLocal;
            }
        }
    }
    else // Character, string, boolean or nullptr
    {
        // The text at the current position is taken from the string conversion of a token located there; it is
        // converted only once.
        CToken tokenTemp = codeLocal.GetLocation();
        const std::string ssAhead = static_cast<std::string>(tokenTemp);
        const auto fnIsWord = [&codeLocal, &ssAhead, &fnIsAlnum](const char* szWord)
        {
            const uint32_t uiLen = static_cast<uint32_t>(std::char_traits<char>::length(szWord));
            return ssAhead.compare(0, uiLen, szWord) == 0 && !fnIsAlnum(codeLocal[uiLen]);
        };

        if (fnIsWord("true") || fnIsWord("TRUE"))
        {
            eTokenLiteralType = ETokenLiteralType::token_literal_boolean;
            codeLocal += 4;
        }
        else if (fnIsWord("false") || fnIsWord("FALSE"))
        {
            eTokenLiteralType = ETokenLiteralType::token_literal_boolean;
            codeLocal += 5;
        }
        else if (fnIsWord("nullptr"))
        {
            eTokenLiteralType = ETokenLiteralType::token_literal_nullptr;
            codeLocal += 7;
        }
        else if (fnIsWord("NULL"))
        {
            eTokenLiteralType = ETokenLiteralType::token_literal_nullptr;
            codeLocal += 4;
        }
        else // check for a string
        {
            // Encoding format of characters.
            enum class ECharEncoding : uint32_t
            {
                encoding_ascii, //!< Encoding for ASCII ISO Latin-1 (8859-1) character set
                encoding_utf8,  //!< Encoding for UTF-8
                encoding_utf16, //!< Encoding for UTF-16
                encoding_utf32, //!< Encoding for UTF-32
                encoding_wide   //!< Encoding for wide character (platform specific)
            };

            // Check for Unicode pre-fix
            ECharEncoding eEncoding = ECharEncoding::encoding_ascii;
            switch (*codeLocal)
            {
            case 'u':
                ++codeLocal;
                if (*codeLocal == '8')
                {
                    eEncoding = ECharEncoding::encoding_utf8;
                    ++codeLocal;
                }
                else
                    eEncoding = ECharEncoding::encoding_utf16;
                break;
            case 'U':
                ++codeLocal;
                eEncoding = ECharEncoding::encoding_utf32;
                break;
            case 'L':
                eEncoding = ECharEncoding::encoding_wide;
                ++codeLocal;
                break;
            default:
                break;
            }

            // Check for raw prefix
            bool bRaw = false;
            if (*codeLocal == 'R')
            {
                bRaw = true;
                ++codeLocal;
            }

            // Check for quote or apostrophe (the beginning of a string or character).
            bool bIsString = false;
            bool bIsCharacter = false;
            if (*codeLocal == '\"')
            {
                if (bRaw)
                    eTokenLiteralType = ETokenLiteralType::token_literal_raw_string;
                else
                    eTokenLiteralType = ETokenLiteralType::token_literal_string;
                bIsString = true;
            }
            else if (*codeLocal == '\'')
            {
                if (bRaw)
                    throw CCompileException(rCode.GetLocation(),"No raw character literal support.");
                if (eEncoding == ECharEncoding::encoding_utf8)
                    throw CCompileException(rCode.GetLocation(),
                        "No UTF-8 character literal support; use ASCII character literal instead.");
                eTokenLiteralType = ETokenLiteralType::token_literal_character;
                bIsCharacter = true;
            }

            // Not a string, a character, a boolean or a nullptr
            if (!bIsString && !bIsCharacter)
                throw CCompileException(rCode.GetLocation(), "Expecting a literal.");

            // Skip left quote or apostrophe
            ++codeLocal;

            // If raw check for delimiter pattern (max 16 chars)
            std::string ssDelimiter;
            if (bRaw)
            {
                // The closing pattern starts with a right parenthesis.
                ssDelimiter += ')';

                // Detect the (optional) raw pattern
                for (int iIndex = 0; iIndex < 16; ++iIndex)
                {
                    if (!*codeLocal)
                        throw CCompileException(rCode.GetLocation(), "Invalid string; unexpected end of file detected.");
                    if (*codeLocal == '(')
                        break;
                    ssDelimiter += *codeLocal;
                    ++codeLocal;
                }

                // And add for the end quote
                ssDelimiter += '\"';

                // Raw strings must start with opening bracket
                if (*codeLocal != '(')
                    throw CCompileException(rCode.GetLocation(), "Invalid string; expecting '('.");

                // Skip opening bracket
                ++codeLocal;
            }
            else
                ssDelimiter = bIsCharacter ? "\'" : "\"";

            // Count the amount of real characters (and skip the amount of read characters).
            uint32_t uiByteCnt = 0;
            uint32_t uiCharCnt = 0;
            switch (eEncoding)
            {
            case ECharEncoding::encoding_ascii:
                {
                    std::string ssText;
                    InterpretCText(codeLocal, ssDelimiter.c_str(), ssText, uiByteCnt, bRaw, true);
                    uiCharCnt = static_cast<uint32_t>(ssText.size());
                    break;
                }
            case ECharEncoding::encoding_utf8:
                {
                    std::string ssText;
                    InterpretCText(codeLocal, ssDelimiter.c_str(), ssText, uiByteCnt, bRaw);
                    uiCharCnt = static_cast<uint32_t>(ssText.size());
                    break;
                }
            case ECharEncoding::encoding_utf16:
                {
                    std::u16string ssText;
                    InterpretCText(codeLocal, ssDelimiter.c_str(), ssText, uiByteCnt, bRaw);
                    uiCharCnt = static_cast<uint32_t>(ssText.size());
                    break;
                }
            case ECharEncoding::encoding_utf32:
                {
                    std::u32string ssText;
                    InterpretCText(codeLocal, ssDelimiter.c_str(), ssText, uiByteCnt, bRaw);
                    uiCharCnt = static_cast<uint32_t>(ssText.size());
                    break;
                }
            case ECharEncoding::encoding_wide:
                {
                    std::wstring ssText;
                    InterpretCText(codeLocal, ssDelimiter.c_str(), ssText, uiByteCnt, bRaw);
                    uiCharCnt = static_cast<uint32_t>(ssText.size());
                    break;
                }
            }
            codeLocal += uiByteCnt;

            // Skip the delimiter
            codeLocal += static_cast<uint32_t>(ssDelimiter.size());

            // Determine whether the character literal is actually a sequence
            if (bIsCharacter && uiCharCnt > 1)
            {
                switch (eEncoding)
                {
                case ECharEncoding::encoding_ascii:
                    if (uiCharCnt == 2 || uiCharCnt == 4 || uiCharCnt == 8)
                        eTokenLiteralType = ETokenLiteralType::token_literal_character_sequence;
                    else
                        throw CCompileException(codeLocal.GetLocation(),
                            "Invalid character sequence; only 2, 4 or 8 characters are allowed.");
                    break;
                case ECharEncoding::encoding_wide:
                    if constexpr (sizeof(wchar_t) == 2)
                    {
                        if (uiCharCnt == 2 || uiCharCnt == 4)
                            eTokenLiteralType = ETokenLiteralType::token_literal_character_sequence;
                        else
                            throw CCompileException(codeLocal.GetLocation(),
                                "Invalid character sequence; only 2 or 4 characters are allowed.");
                    }
                    else // sizeof is 4
                    {
                        if (uiCharCnt == 2)
                            eTokenLiteralType = ETokenLiteralType::token_literal_character_sequence;
                        else
                            throw CCompileException(codeLocal.GetLocation(),
                                "Invalid character sequence; only 2 characters are allowed.");
                    }
                    break;
                default:
                    throw CCompileException(codeLocal.GetLocation(),
                        "Character sequences are only allowed with ASCII or wide characters.");
                }
            }
        }
    }

    // Initialize the token
    CToken token = rCode.GetLocation(ETokenType::token_literal);

    // Update the position
    rCode = codeLocal;

    // No further digit and alpha-digits are allowed to follow the literal
    if (m_eLexingMode != ELexingMode::lexing_preproc && (fnIsAlnum(*rCode) || *rCode == '_'))
        throw CCompileException(rCode.GetLocation(), "Invalid characters following literal.");

    // Update the token
    rCode.UpdateLocation(token, eTokenLiteralType);

    // Check whether the token is valid
    if (!token)
        throw CCompileException(token, "Internal error: the token is not valid.");

    return token;
}
/**
 * @brief Convert a zero-terminated code string into a list of tokens.
 * @details A case-sensitive lexer with a dummy callback is used. Lexing stops at the first empty token. A compile
 * error also stops lexing; the error itself is discarded and the tokens collected until then are returned.
 * @param[in] szCode The code to tokenize.
 * @param[in] rptrContext Context passed on to the lexer for every token.
 * @return The list of tokens.
 */
CTokenList Tokenize(const char* szCode, const CContextPtr& rptrContext)
{
    CCodePos code(szCode);
    SLexerDummyCallback sCallback;
    CLexer lexer(&sCallback, true);
    CTokenList lstResult;
    try
    {
        // Collect tokens until the lexer returns an empty token
        for (CToken tokenNext = lexer.GetToken(code, rptrContext); tokenNext; tokenNext = lexer.GetToken(code, rptrContext))
            lstResult.push_back(std::move(tokenNext));
    }
    catch (const sdv::idl::XCompileError&)
    {
        // Best effort: keep the tokens that were read before the error occurred.
    }
    return lstResult;
}
/**
 * @brief Convert a code string into a list of tokens.
 * @details Forwards to the overload taking a zero-terminated string.
 * @param[in] rssCode The code to tokenize.
 * @param[in] rptrContext Context passed on to the lexer for every token.
 * @return The list of tokens.
 */
CTokenList Tokenize(const std::string& rssCode, const CContextPtr& rptrContext)
{
    const char* const szCode = rssCode.c_str();
    return Tokenize(szCode, rptrContext);
}