IFPSTools.NET/LibIFPSCC/Scanner/Char.cs
zc adc9a2dde1 cc: add timer for each stage (via compile time config)
cc: optimise lexer for speed (~10x speed improvement)
cc: implement OperatorParser class as per TODO in original codebase (seems refactoring chains of parser-combinators leads to at least SOME speed improvement)
2023-07-11 17:57:57 +01:00

379 lines
12 KiB
C#

using System;
using System.Text;
namespace LexicalAnalysis {
/// <summary>
/// Token representing a character constant.
/// Carries both the source text of the character (as handed over by the scanner)
/// and the character value it was decoded to.
/// </summary>
public sealed class TokenCharConst : Token {
    /// <summary>
    /// Creates a character-constant token.
    /// </summary>
    /// <param name="raw">The character's text as scanned (e.g. an escape sequence is kept verbatim).</param>
    /// <param name="value">The decoded character value.</param>
    public TokenCharConst(String raw, Char value) {
        Value = value;
        Raw = raw;
    }

    /// <summary>The scanned text of the character.</summary>
    public String Raw { get; }

    /// <summary>The decoded character value.</summary>
    public Char Value { get; }

    /// <summary>Always <c>TokenKind.CHAR</c>.</summary>
    public override TokenKind Kind => TokenKind.CHAR;

    /// <summary>
    /// Formats the token as "KIND [line:column]: 'raw'".
    /// Line and Column are not declared in this class (presumably inherited from Token).
    /// </summary>
    public override String ToString() {
        return $"{Kind} [{Line}:{Column}]: '{Raw}'";
    }
}
/// <summary>
/// The FSA for scanning a single C character.
/// Note that this FSA doesn't scan the surrounding quotes.
/// It is used in both FSACharConst and FSAString.
///
/// There are multiple ways to represent a character:
/// * A normal character : any character other than \\ \n or [quote]
/// Note that [quote] might be \' or \" depending on the context.
/// For example, inside a String, single quotes are allowed, which means that the following code is legal:
/// Char *str = "single quote here: ' see that?";
///
/// However, if we need a double quote inside a String, we have to use an escape character, like this:
/// Char *str = "double quote needs to be escaped: \" ";
///
/// Inside a Char, double quotes are allowed while single quotes need to be escaped.
/// Char double_quote = '"'; // allowed
/// Char single_quote = '\''; // needs to be escaped
///
/// * An escape character : \a \b \f \n \r \t \v \' \" \\ \?
/// Note that even though \' and \" might not need to be escaped, you can always write them escaped.
/// If you escape a character not listed above, the behavior is undefined in the standard.
/// RetrieveChar falls back to the unescaped character in that case
/// (for example '\c' would be treated as 'c'), provided the scanner accepted it at all.
///
/// * An octal number (one to three digits) after a backslash. For example : \123.
///
/// * A hexadecimal number (one or two digits) after a backslash and an 'x' or 'X'. For example : \xFF.
///
/// The FSA needs one character of lookahead to know that a character is complete, so the
/// last character in the internal buffer is never part of the scanned character itself.
/// </summary>
public sealed class FSAChar : FSA {
    private enum State {
        START,  // nothing read yet
        END,    // a complete character plus one lookahead character has been read
        ERROR,  // the input cannot be a character
        S,      // read a backslash
        C,      // read a complete normal or simple-escape character; waiting for the lookahead
        SO,     // read a backslash and one octal digit
        SOO,    // read a backslash and two octal digits
        SOOO,   // read a backslash and three octal digits
        SX,     // read a backslash and 'x' / 'X'
        SXH,    // read a backslash, 'x' / 'X' and one hex digit
        SXHH    // read a backslash, 'x' / 'X' and two hex digits
    }

    private State _state;

    // Everything fed to the FSA since the last Reset, including the trailing lookahead.
    private readonly StringBuilder _scanned;

    // quote : Char
    // ============
    // \' in a Char, and \" in a String.
    private readonly Char _quote;

    /// <summary>
    /// Creates the FSA.
    /// </summary>
    /// <param name="quote">The quote character that terminates the enclosing literal and therefore must be escaped.</param>
    public FSAChar(Char quote) {
        this._state = State.START;
        this._quote = quote;
        this._scanned = new StringBuilder();
    }

    /// <summary>
    /// Discards everything scanned so far and returns to the start state.
    /// </summary>
    public override void Reset() {
        this._scanned.Clear();
        this._state = State.START;
    }

    /// <summary>
    /// Maps the internal state to the generic FSA status:
    /// START is NONE, END is END, ERROR is ERROR, and every intermediate state is RUNNING.
    /// </summary>
    public override FSAStatus GetStatus() {
        switch (this._state) {
            case State.START:
                return FSAStatus.NONE;
            case State.END:
                return FSAStatus.END;
            case State.ERROR:
                return FSAStatus.ERROR;
            default:
                return FSAStatus.RUNNING;
        }
    }

    // IsChar : Char -> Boolean
    // ========================
    // the character is a 'normal' Char, other than <quote> \\ or \n
    //
    private Boolean IsChar(Char ch) {
        return ch != this._quote && ch != '\\' && ch != '\n';
    }

    // RetrieveRaw : () -> String
    // ==========================
    // The scanned text without the trailing lookahead character.
    // Only meaningful once at least one character has been read.
    //
    public String RetrieveRaw() {
        return this._scanned.ToString(0, this._scanned.Length - 1);
    }

    // TryParseDigits : (Int32, Int32, Int32) -> Boolean * Int32
    // =========================================================
    // Parses _scanned[start, end) as an unsigned number in the given radix (8 or 16).
    // Fails (returns false) if the range is empty, contains a character that is not a
    // digit of that radix, or the value does not fit in a Char.
    //
    private Boolean TryParseDigits(Int32 start, Int32 end, Int32 radix, out Int32 value) {
        value = 0;
        if (start >= end) {
            return false;
        }
        for (Int32 i = start; i < end; ++i) {
            Char ch = this._scanned[i];
            Int32 digit;
            if (ch >= '0' && ch <= '9') {
                digit = ch - '0';
            } else if (ch >= 'a' && ch <= 'f') {
                digit = ch - 'a' + 10;
            } else if (ch >= 'A' && ch <= 'F') {
                digit = ch - 'A' + 10;
            } else {
                return false;
            }
            if (digit >= radix) {
                return false;
            }
            value = value * radix + digit;
            if (value > Char.MaxValue) {
                return false;
            }
        }
        return true;
    }

    // RetrieveChar : () -> Char
    // =========================
    // Decodes the scanned text into the character it denotes:
    //   * octal escape (\123)       -> the character with that octal code
    //   * hexadecimal escape (\xFF) -> the character with that hex code
    //   * simple escape (\n, \t...) -> the corresponding control / punctuation character
    //   * anything else             -> the first scanned character
    // The last character of the buffer is the lookahead and is never decoded.
    //
    public Char RetrieveChar() {
        Int32 length = this._scanned.Length;

        // Numeric escapes: backslash, optional 'x' / 'X', digits, then the lookahead.
        if (length >= 3 && this._scanned[0] == '\\') {
            Int32 end = length - 1; // exclude the lookahead character
            Char selector = this._scanned[1];
            Int32 code;
            if ((selector == 'x' || selector == 'X') && TryParseDigits(2, end, 16, out code)) {
                return (Char)code;
            }
            if (TryParseDigits(1, end, 8, out code)) {
                return (Char)code;
            }
        }

        // Simple escapes: backslash, one character, then the lookahead.
        if (length == 3) {
            switch (this._scanned[1]) {
                case 'a':
                    return '\a';
                case 'b':
                    return '\b';
                case 'f':
                    return '\f';
                case 'n':
                    return '\n';
                case 'r':
                    return '\r';
                case 't':
                    return '\t';
                case 'v':
                    return '\v';
                case '\'':
                    return '\'';
                case '\"':
                    return '\"';
                case '\\':
                    return '\\';
                case '?':
                    return '?';
                default:
                    // Unknown escape: use the escaped character itself.
                    return this._scanned[1];
            }
        }

        // Normal character.
        return this._scanned[0];
    }

    // RetrieveToken : () -> Token
    // ===========================
    // Note that this function never gets used, because FSAChar is just an inner FSA for other FSAs.
    //
    public override Token RetrieveToken() {
        return new EmptyToken();
    }

    // ReadChar : Char -> ()
    // =====================
    // Implementation of the FSA.
    // Every character is recorded, including the lookahead that moves the FSA to END.
    //
    public override void ReadChar(Char ch) {
        this._scanned.Append(ch);
        switch (this._state) {
            case State.END:
            case State.ERROR:
                // Reading past a finished or failed scan is an error.
                this._state = State.ERROR;
                break;
            case State.START:
                if (IsChar(ch)) {
                    this._state = State.C;
                } else if (ch == '\\') {
                    this._state = State.S;
                } else {
                    this._state = State.ERROR;
                }
                break;
            case State.C:
                // ch is the lookahead after a complete character.
                this._state = State.END;
                break;
            case State.S:
                if (Utils.IsEscapeChar(ch)) {
                    this._state = State.C;
                } else if (Utils.IsOctDigit(ch)) {
                    this._state = State.SO;
                } else if (ch == 'x' || ch == 'X') {
                    this._state = State.SX;
                } else {
                    this._state = State.ERROR;
                }
                break;
            case State.SX:
                // At least one hex digit is required after \x.
                if (Utils.IsHexDigit(ch)) {
                    this._state = State.SXH;
                } else {
                    this._state = State.ERROR;
                }
                break;
            case State.SXH:
                if (Utils.IsHexDigit(ch)) {
                    this._state = State.SXHH;
                } else {
                    this._state = State.END;
                }
                break;
            case State.SXHH:
                // At most two hex digits: whatever follows is the lookahead.
                this._state = State.END;
                break;
            case State.SO:
                if (Utils.IsOctDigit(ch)) {
                    this._state = State.SOO;
                } else {
                    this._state = State.END;
                }
                break;
            case State.SOO:
                if (Utils.IsOctDigit(ch)) {
                    this._state = State.SOOO;
                } else {
                    this._state = State.END;
                }
                break;
            case State.SOOO:
                // At most three octal digits: whatever follows is the lookahead.
                this._state = State.END;
                break;
            default:
                this._state = State.ERROR;
                break;
        }
    }

    // ReadEOF : () -> ()
    // ==================
    // End of input acts as the lookahead: a complete character ends successfully,
    // anything else is an error.
    //
    public override void ReadEOF() {
        // Placeholder standing in for the lookahead character, so that RetrieveRaw and
        // RetrieveChar can always ignore the last buffered character.
        this._scanned.Append('0');
        switch (this._state) {
            case State.C:
            case State.SO:
            case State.SOO:
            case State.SOOO:
            case State.SXH:
            case State.SXHH:
                this._state = State.END;
                break;
            default:
                this._state = State.ERROR;
                break;
        }
    }
}
/// <summary>
/// FSACharConst
/// ============
/// The FSA for scanning a C character constant.
/// Upon finish, a character token can be retrieved.
///
/// A character constant is written either as
/// * '[char]'
/// or
/// * L'[char]'
///
/// The character between the quotes is scanned by an inner FSAChar.
/// Note that if the inner character is a single quote, it needs to be escaped.
/// Like the inner FSA, this one needs one character (or EOF) after the closing quote
/// before it reports END.
/// </summary>
public sealed class FSACharConst : FSA {
    private enum State {
        START,  // nothing read yet
        END,    // complete constant plus one following character / EOF
        ERROR,  // input is not a character constant
        L,      // read the 'L' prefix
        Q,      // read the opening quote; the inner FSA is consuming the character
        QC,     // inner character complete; expecting the closing quote
        QCQ     // read the closing quote
    };

    private State _state;
    private Char _val;
    private String _raw;
    private readonly FSAChar _fsachar;

    public FSACharConst() {
        this._fsachar = new FSAChar('\'');
        this._state = State.START;
    }

    /// <summary>
    /// Returns this FSA and its inner character FSA to their start states.
    /// </summary>
    public override void Reset() {
        this._fsachar.Reset();
        this._state = State.START;
    }

    /// <summary>
    /// START maps to NONE, END to END, ERROR to ERROR; any other state is RUNNING.
    /// </summary>
    public override FSAStatus GetStatus() {
        switch (this._state) {
            case State.START:
                return FSAStatus.NONE;
            case State.END:
                return FSAStatus.END;
            case State.ERROR:
                return FSAStatus.ERROR;
            default:
                return FSAStatus.RUNNING;
        }
    }

    /// <summary>
    /// Builds the token from the raw text and value captured when the inner character finished.
    /// </summary>
    public override Token RetrieveToken() {
        return new TokenCharConst(this._raw, this._val);
    }

    // Transition taken on the opening quote: start scanning the inner character afresh.
    private void EnterQuote() {
        this._state = State.Q;
        this._fsachar.Reset();
    }

    // Feeds one character to the inner FSA while in state Q.
    private void FeedInner(Char ch) {
        this._fsachar.ReadChar(ch);
        FSAStatus inner = this._fsachar.GetStatus();
        if (inner == FSAStatus.END) {
            // The inner FSA only finishes on its lookahead character, so ch has not been
            // consumed as part of the character: capture the result, then re-process ch
            // in the new state.
            this._state = State.QC;
            this._raw = this._fsachar.RetrieveRaw();
            this._val = this._fsachar.RetrieveChar();
            this._fsachar.Reset();
            ReadChar(ch);
        } else if (inner == FSAStatus.ERROR) {
            this._state = State.ERROR;
        }
    }

    /// <summary>
    /// Advances the FSA by one input character.
    /// </summary>
    public override void ReadChar(Char ch) {
        switch (this._state) {
            case State.START:
                if (ch == 'L') {
                    this._state = State.L;
                } else if (ch == '\'') {
                    EnterQuote();
                } else {
                    this._state = State.ERROR;
                }
                break;
            case State.L:
                if (ch == '\'') {
                    EnterQuote();
                } else {
                    this._state = State.ERROR;
                }
                break;
            case State.Q:
                FeedInner(ch);
                break;
            case State.QC:
                this._state = (ch == '\'') ? State.QCQ : State.ERROR;
                break;
            case State.QCQ:
                // Any character after the closing quote completes the constant.
                this._state = State.END;
                break;
            default:
                // END, ERROR (and anything unexpected): further input is an error.
                this._state = State.ERROR;
                break;
        }
    }

    /// <summary>
    /// End of input completes the constant only if the closing quote has just been read.
    /// </summary>
    public override void ReadEOF() {
        this._state = (this._state == State.QCQ) ? State.END : State.ERROR;
    }
}
}