/**
 * Structures and functions for grouping lexemes into tokens. The tokenizer
 * reads through an array of lexemes (generated by the lexer) and groups them
 * into tokens based on their structure. In addition, some lexemes with
 * semantic meaning (such as integers, floats, strings, and booleans) will have
 * their values extracted and stored.
 *
 * \file tokenizer.h
 *
 * \author Justin J. Meza
 *
 * \date 2010-2012
 */

#ifndef __TOKENIZER_H__
#define __TOKENIZER_H__

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include "lexer.h"
#include "error.h"

#undef DEBUG

/**
 * Represents a token type. All of the token type names correspond to either
 * the semantic type of token data or the lexemes which make up the particular
 * token.
 *
 * The enumerators carry no explicit values, so they are numbered
 * consecutively from zero in declaration order; TT_ENDOFTOKENS is therefore
 * equal to the number of real token types.
 *
 * \note Remember to update the keywords array (below) with the token image.
 */
typedef enum {
	TT_INTEGER,     /**< Integer literal. */
	TT_FLOAT,       /**< Decimal literal. */
	TT_STRING,      /**< String literal. */
	TT_IDENTIFIER,  /**< Identifier literal. */
	TT_BOOLEAN,     /**< Boolean literal. */
	TT_IT,          /**< \ref impvar "Implicit variable". */
	TT_ITZLIEKA,    /**< Inherited object declaration. */
	TT_NOOB,        /**< Nil keyword. */
	TT_NUMBR,       /**< Integer keyword. */
	TT_NUMBAR,      /**< Decimal keyword. */
	TT_TROOF,       /**< Boolean keyword. */
	TT_YARN,        /**< String keyword. */
	TT_BUKKIT,      /**< Array. */
	TT_EOF,         /**< End of file. */
	TT_NEWLINE,     /**< Newline. */
	TT_HAI,         /**< Beginning of main block. */
	TT_KTHXBYE,     /**< End of main block. */
	TT_HASA,        /**< Variable declaration. */
	TT_HASAN,       /**< Variable declaration. */
	TT_ITZA,        /**< Variable type initialization. */
	TT_ITZ,         /**< Variable value initialization. */
	TT_RNOOB,       /**< Deallocation. */
	TT_R,           /**< Assignment. */
	TT_ANYR,        /**< User-defined function argument separator. */
	TT_AN,          /**< Built-in function argument separator. */
	TT_SUMOF,       /**< Addition. */
	TT_DIFFOF,      /**< Subtraction. */
	TT_PRODUKTOF,   /**< Multiplication. */
	TT_QUOSHUNTOF,  /**< Division. */
	TT_MODOF,       /**< Modulo. */
	TT_BIGGROF,     /**< Greater than. */
	TT_SMALLROF,    /**< Less than. */
	TT_BOTHOF,      /**< Logical AND. */
	TT_EITHEROF,    /**< Logical OR. */
	TT_WONOF,       /**< Logical XOR. */
	TT_NOT,         /**< Logical NOT. */
	TT_MKAY,        /**< Infinite arity argument delimiter. */
	TT_ALLOF,       /**< Infinite arity logical AND. */
	TT_ANYOF,       /**< Infinite arity logical OR. */
	TT_BOTHSAEM,    /**< Equality. */
	TT_DIFFRINT,    /**< Inequality. */
	TT_MAEK,        /**< Cast. */
	TT_A,           /**< Cast target separator. */
	TT_ISNOWA,      /**< In-place cast. */
	TT_VISIBLE,     /**< Print. */
	TT_INVISIBLE,   /**< Print to standard error. */
	TT_SMOOSH,      /**< String concatenation. */
	TT_BANG,        /**< Exclamation point (!) */
	TT_GIMMEH,      /**< Input. */
	TT_ORLY,        /**< Conditional. */
	TT_YARLY,       /**< True branch. */
	TT_MEBBE,       /**< Else branch. */
	TT_NOWAI,       /**< False branch. */
	TT_OIC,         /**< Conditional and switch delimiter. */
	TT_WTF,         /**< Switch. */
	TT_OMG,         /**< Case. */
	TT_OMGWTF,      /**< Default case. */
	TT_GTFO,        /**< Break or return without value. */
	TT_IMINYR,      /**< Loop beginning. */
	TT_UPPIN,       /**< Auto increment loop variable. */
	TT_NERFIN,      /**< Auto decrement loop variable. */
	TT_YR,          /**< Function name delimiter. */
	TT_TIL,         /**< Do until. */
	TT_WILE,        /**< Do while. */
	TT_IMOUTTAYR,   /**< Loop ending. */
	TT_HOWIZ,       /**< Function definition beginning. */
	TT_IZ,          /**< Function scope delimiter. */
	TT_IFUSAYSO,    /**< Function definition end. */
	TT_FOUNDYR,     /**< Return with value. */
	TT_SRS,         /**< Indirect variable access. */
	TT_APOSTROPHEZ, /**< Array slot access ('Z). */
	TT_OHAIIM,      /**< Alternate array declaration. */
	TT_IMLIEK,      /**< Alternate inherited object declaration. */
	TT_KTHX,        /**< End of alternate array declaration. */
	TT_IDUZ,        /**< System command. */
	TT_CANHAS,      /**< Library import declaration. */
	TT_QUESTION,    /**< End of library import declaration. */

	TT_ENDOFTOKENS  /**< Sentinel end of this enum -- don't move it! */
} TokenType;

/**
 * The image of each token type, listed in the same order as the TokenType
 * enumerators so that an entry can be looked up by its token type. Token
 * types with no fixed image (literals, end of file, newline, and the
 * TT_ENDOFTOKENS sentinel) are represented by the empty string.
 *
 * \note This array is defined \c static in a header, so every translation
 *       unit that includes this file receives its own copy.
 *
 * \note Keep the entries in step with the TokenType enum: one entry per
 *       enumerator, in declaration order.
 */
static const char *keywords[] = {
	"",            /* TT_INTEGER */
	"",            /* TT_FLOAT */
	"",            /* TT_STRING */
	"",            /* TT_IDENTIFIER */
	"",            /* TT_BOOLEAN */
	"IT",          /* TT_IT */
	"ITZ LIEK A",  /* TT_ITZLIEKA */
	"NOOB",        /* TT_NOOB */
	"NUMBR",       /* TT_NUMBR */
	"NUMBAR",      /* TT_NUMBAR */
	"TROOF",       /* TT_TROOF */
	"YARN",        /* TT_YARN */
	"BUKKIT",      /* TT_BUKKIT */
	"",            /* TT_EOF */
	"",            /* TT_NEWLINE */
	"HAI",         /* TT_HAI */
	"KTHXBYE",     /* TT_KTHXBYE */
	"HAS A",       /* TT_HASA */
	"HAS AN",      /* TT_HASAN */
	"ITZ A",       /* TT_ITZA */
	"ITZ",         /* TT_ITZ */
	"R NOOB",      /* TT_RNOOB */
	"R",           /* TT_R */
	"AN YR",       /* TT_ANYR */
	"AN",          /* TT_AN */
	"SUM OF",      /* TT_SUMOF */
	"DIFF OF",     /* TT_DIFFOF */
	"PRODUKT OF",  /* TT_PRODUKTOF */
	"QUOSHUNT OF", /* TT_QUOSHUNTOF */
	"MOD OF",      /* TT_MODOF */
	"BIGGR OF",    /* TT_BIGGROF */
	"SMALLR OF",   /* TT_SMALLROF */
	"BOTH OF",     /* TT_BOTHOF */
	"EITHER OF",   /* TT_EITHEROF */
	"WON OF",      /* TT_WONOF */
	"NOT",         /* TT_NOT */
	"MKAY",        /* TT_MKAY */
	"ALL OF",      /* TT_ALLOF */
	"ANY OF",      /* TT_ANYOF */
	"BOTH SAEM",   /* TT_BOTHSAEM */
	"DIFFRINT",    /* TT_DIFFRINT */
	"MAEK",        /* TT_MAEK */
	"A",           /* TT_A */
	"IS NOW A",    /* TT_ISNOWA */
	"VISIBLE",     /* TT_VISIBLE */
	"INVISIBLE",   /* TT_INVISIBLE */
	"SMOOSH",      /* TT_SMOOSH */
	"!",           /* TT_BANG */
	"GIMMEH",      /* TT_GIMMEH */
	"O RLY",       /* TT_ORLY */
	"YA RLY",      /* TT_YARLY */
	"MEBBE",       /* TT_MEBBE */
	"NO WAI",      /* TT_NOWAI */
	"OIC",         /* TT_OIC */
	"WTF",         /* TT_WTF */
	"OMG",         /* TT_OMG */
	"OMGWTF",      /* TT_OMGWTF */
	"GTFO",        /* TT_GTFO */
	"IM IN YR",    /* TT_IMINYR */
	"UPPIN",       /* TT_UPPIN */
	"NERFIN",      /* TT_NERFIN */
	"YR",          /* TT_YR */
	"TIL",         /* TT_TIL */
	"WILE",        /* TT_WILE */
	"IM OUTTA YR", /* TT_IMOUTTAYR */
	"HOW IZ",      /* TT_HOWIZ */
	"IZ",          /* TT_IZ */
	"IF U SAY SO", /* TT_IFUSAYSO */
	"FOUND YR",    /* TT_FOUNDYR */
	"SRS",         /* TT_SRS */
	"'Z",          /* TT_APOSTROPHEZ */
	"O HAI IM",    /* TT_OHAIIM */
	"IM LIEK",     /* TT_IMLIEK */
	"KTHX",        /* TT_KTHX */
	"I DUZ",       /* TT_IDUZ */
	"CAN HAS",     /* TT_CANHAS */
	"?",           /* TT_QUESTION */
	""             /* TT_ENDOFTOKENS */
};

/**
 * Stores token data with semantic meaning.
 *
 * This is a union: the two members share storage, so only the member that
 * was most recently written holds a meaningful value.
 */
typedef union {
	long long i; /**< Integer data. */
	float f;     /**< Decimal data. */
} TokenData;

/**
|
|
|
|
* Stores a token type and any parsed values.
|
|
|
|
*/
|
2010-08-09 07:01:59 +00:00
|
|
|
typedef struct {
|
|
|
|
TokenType type; /**< The type of token. */
|
|
|
|
TokenData data; /**< The stored data of type \a type. */
|
2011-06-15 06:54:12 +00:00
|
|
|
char *image; /**< The characters that comprise the token. */
|
|
|
|
const char *fname; /**< The name of the file containing the token. */
|
|
|
|
unsigned int line; /**< The line number the token was on. */
|
2010-08-09 07:01:59 +00:00
|
|
|
} Token;
|
|
|
|
|
2011-06-15 06:54:12 +00:00
|
|
|
/**
|
|
|
|
* \name Utilities
|
|
|
|
*
|
|
|
|
* Functions for performing helper tasks.
|
|
|
|
*/
|
|
|
|
/**@{*/
|
2010-08-09 07:01:59 +00:00
|
|
|
int isInteger(const char *);
|
|
|
|
int isFloat(const char *);
|
|
|
|
int isString(const char *);
|
|
|
|
int isIdentifier(const char *);
|
2011-06-15 06:54:12 +00:00
|
|
|
Token *isKeyword(LexemeList *, unsigned int *);
|
|
|
|
/**@}*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* \name Token modifiers
|
|
|
|
*
|
|
|
|
* Functions for creating and deleting tokens.
|
|
|
|
*/
|
|
|
|
/**@{*/
|
2010-08-09 07:01:59 +00:00
|
|
|
Token *createToken(TokenType, const char *, const char *, unsigned int);
|
|
|
|
void deleteToken(Token *);
|
2010-12-21 08:20:31 +00:00
|
|
|
int addToken(Token ***, unsigned int *, Token*);
|
2010-08-09 07:01:59 +00:00
|
|
|
void deleteTokens(Token **);
|
|
|
|
unsigned int acceptLexemes(LexemeList *, unsigned int, const char *);
|
2011-06-15 06:54:12 +00:00
|
|
|
/**@}*/
|
|
|
|
|
|
|
|
/**
|
|
|
|
* \name Lexeme tokenizer
|
|
|
|
*
|
|
|
|
* Generates tokens from lexemes.
|
|
|
|
*/
|
|
|
|
/**@{*/
|
2010-08-09 07:01:59 +00:00
|
|
|
Token **tokenizeLexemes(LexemeList *);
|
2011-06-15 06:54:12 +00:00
|
|
|
/**@}*/
|
2010-08-09 07:01:59 +00:00
|
|
|
|
|
|
|
#endif /* __TOKENIZER_H__ */